Protein: uniprot, 2023-02

import pandas as pd
def _get_shortest_name(df: pd.DataFrame, column: str, new_column="name"):
    """Get a single shortest name from a column of lists.

    Everyone else became synonyms.
    """
    synonyms_list = []
    name_list = []
    for lst in df[column]:

        def shortest_name(lst: list):
            return min(lst, key=len)

        synonyms = set(lst.split(", "))
        no_space_names = [i for i in synonyms if " " not in i]
        if len(no_space_names) == 0:
            name = shortest_name(synonyms)
        else:
            name = shortest_name(no_space_names)
        name_list.append(name)
        synonyms.remove(name)
        synonyms_list.append("|".join([i for i in synonyms]))

    df[new_column] = name_list
    df[column] = synonyms_list

Files are downloaded from: https://www.uniprot.org/uniprotkb

# Downloaded on 2022-09-26

# Maps each species to the gzipped TSV export of its UniProt table.
filepaths = dict(
    human="https://bionty-assets.s3.amazonaws.com/uniprot-human-2023-02.tsv.gz",
    mouse="https://bionty-assets.s3.amazonaws.com/uniprot-mouse-2023-02.tsv.gz",
)

Human and mouse

Curate the tables

# Curate one table per species: load the raw export, normalise the columns,
# derive a short name, and write the result to a parquet file.
for species, filepath in filepaths.items():
    print(f"Loading {species} data...")

    df = pd.read_csv(filepath, sep="\t")

    print(f"shape: {df.shape}")
    # NOTE(review): `display` is not imported in the visible code; presumably
    # the notebook (IPython) builtin -- confirm before running as a script.
    display(df.head())

    df = df.rename(
        columns={
            "Entry": "uniprotkb_id",
            "Protein names": "synonyms",
            "Length": "length",
            "Gene Names (primary)": "gene_symbol",
            "GeneID": "ncbi_gene_ids",
        }
    )

    # concatenate ncbi gene ids with |
    # Raw values look like "1234;5678;": drop the trailing ";" first so the
    # result does not end with a dangling "|".
    df["ncbi_gene_ids"] = df["ncbi_gene_ids"].fillna("")
    # regex=False forces a literal replacement; the default of `regex` differs
    # between pandas versions.
    df["ncbi_gene_ids"] = (
        df["ncbi_gene_ids"].str.rstrip(";").str.replace(";", "|", regex=False)
    )

    # pick the shortest name from synonyms as name
    # concatenate the rest synonyms with |
    # Missing values are replaced with "" because the helper calls .split()
    # on every value.
    df["synonyms"] = df["synonyms"].fillna("")
    _get_shortest_name(df, "synonyms")
    df = df[
        ["uniprotkb_id", "name", "length", "synonyms", "gene_symbol", "ncbi_gene_ids"]
    ]

    # sort by uniprotkb id, reset index
    # Rows without an id are dropped first.
    df = df[~df["uniprotkb_id"].isnull()]
    df = df.sort_values("uniprotkb_id").reset_index(drop=True)

    print(f"shape: {df.shape}, unique: {df.uniprotkb_id.is_unique}")
    display(df.head())

    filename = f"df_{species}__uniprot__2023-02__Protein.parquet"
    df.to_parquet(filename)

    print(f"Wrote {filename}.")
    print("------------------------------------------------")
Loading human data...
shape: (207780, 5)
Entry Protein names Length Gene Names (primary) GeneID
0 A0A024QZ08 Intraflagellar transport 20 homolog (Chlamydom... 132 IFT20 90410;
1 A0A024QZ26 Histone deacetylase 6, isoform CRA_c 1215 HDAC6 10013;
2 A0A024QZ86 T-box 2, isoform CRA_a 712 TBX2 6909;
3 A0A024QZA8 receptor protein-tyrosine kinase (EC 2.7.10.1) 976 EPHA2 1969;
4 A0A024QZB8 Battenin 438 CLN3 1201;
shape: (207780, 6), unique: True
uniprotkb_id name length synonyms gene_symbol ncbi_gene_ids
0 A0A023HJ61 HRES-1/RAB4 variant 121 RAB4A
1 A0A023HN28 SRSF3/USP6 fusion protein 16 NaN
2 A0A023I7F4 Cytochrome b 380 CYTB
3 A0A023I7H2 NADH-ubiquinone oxidoreductase chain 5 (EC 7.1... 603 ND5
4 A0A023I7H5 ATP synthase subunit a 226 ATP6
Wrote human_uniprot_2023-02_Protein_lookup.parquet.
------------------------------------------------
Loading mouse data...
shape: (86411, 5)
Entry Protein names Length Gene Names (primary) GeneID
0 A0A075F5C6 Heat shock factor 1 (Heat shock transcription ... 531 Hsf1 15499;
1 A0A087WPF7 Autism susceptibility gene 2 protein homolog 1261 Auts2 NaN
2 A0A087WPU4 FAT atypical cadherin 1 159 Fat1 NaN
3 A0A087WRK1 Predicted gene, 20814 (Predicted gene, 20855) ... 222 Gm20905 100042201;100042279;108167378;108168553;108168...
4 A0A087WRT4 FAT atypical cadherin 1 4602 Fat1 14107;
shape: (86411, 6), unique: True
uniprotkb_id name length synonyms gene_symbol ncbi_gene_ids
0 A0A023JDV8 Creatine transporter SLC6A8 variant D 224 Slc6a8
1 A0A023NCR8 Cytochrome b (Complex III subunit 3) (Complex ... 233 cytB
2 A0A023NCS0 Cytochrome b (Complex III subunit 3) (Complex ... 222 cytB
3 A0A023ND59 Cytochrome b (Complex III subunit 3) (Complex ... 227 cytB
4 A0A023NDP0 Cytochrome b (Complex III subunit 3) (Complex ... 242 cytB
Wrote mouse_uniprot_2023-02_Protein_lookup.parquet.
------------------------------------------------