Organism: ncbitaxon, 2023-06-20

import bionty as bt
# Open the NCBITaxon (2023-06-20) ontology file stored in the local bionty
# `_dynamic` directory.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook anywhere else.
onto = bt.Ontology(
    "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism"
)
# Bare expression: the notebook displays the ontology's repr.
onto
Ontology('/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism', timeout=100)
# Look up one term by its ontology ID and display it.
term = onto["NCBITaxon:9606"]
term
Term('NCBITaxon:9606', name='Homo sapiens')
# Descriptions of this term's synonyms whose scope is EXACT.
[syn.description for syn in term.synonyms if syn.scope == "EXACT"]
['human']
# Build one row per ontology term:
#   (ontology id, common name, scientific name, definition, synonyms, parents)
df_values = []
for term in onto.terms():
    # Definition text, or None when the term has no definition.
    # NOTE(review): if `term.definition` is a str, `.title()` capitalizes every
    # word of it — confirm that is intended rather than keeping the text as-is.
    definition = None if term.definition is None else term.definition.title()

    # Descriptions of the EXACT-scope synonyms. pronto exposes `term.synonyms`
    # as a frozenset, whose iteration order is not stable between interpreter
    # runs, so sort the descriptions to make the output reproducible.
    synonyms_list = sorted(
        i.description for i in term.synonyms if i.scope == "EXACT"
    )
    # The first exact synonym becomes the common name; the remaining ones are
    # stored as a "|"-separated string. Terms without exact synonyms fall back
    # to their scientific name.
    # NOTE(review): "first" now means alphabetically first — confirm this is an
    # acceptable rule for choosing the common name.
    if synonyms_list:
        common_name = synonyms_list[0]
        synonyms_list = synonyms_list[1:]
    else:
        common_name = term.name
    # An empty joined string means "no further synonyms" and is stored as None.
    synonyms = "|".join(synonyms_list) or None

    # IDs of the direct (distance 1) parents, restricted to NCBITaxon terms and
    # sorted so the list does not depend on set iteration order.
    superclasses = sorted(
        s.id
        for s in term.superclasses(distance=1, with_self=False).to_set()
        if s.id.startswith("NCBITaxon")
    )

    df_values.append(
        (term.id, common_name, term.name, definition, synonyms, superclasses)
    )
# Bare expression: the notebook displays the number of collected rows.
len(df_values)
2511800
import pandas as pd

# Turn the collected per-term tuples into a table with one row per term.
df = pd.DataFrame(
    data=df_values,
    columns=[
        "ontology_id",
        "name",
        "scientific_name",
        "definition",
        "synonyms",
        "parents",
    ],
)
# Key the table by ontology ID; the remaining five fields stay as columns.
df = df.set_index("ontology_id")
df
name scientific_name definition synonyms parents
ontology_id
NCBITaxon:1 root root None None []
NCBITaxon:10 Cellvibrio Cellvibrio None None [NCBITaxon:1706371]
NCBITaxon:100 Ancylobacter aquaticus Ancylobacter aquaticus None None [NCBITaxon:99]
NCBITaxon:100000 Herbaspirillum sp. BA12 Herbaspirillum sp. BA12 None None [NCBITaxon:2624150]
NCBITaxon:1000000 Microbacterium sp. 6.11-VPa Microbacterium sp. 6.11-VPa None None [NCBITaxon:2609290]
... ... ... ... ... ...
NCBITaxon:superorder superorder superorder None None [NCBITaxon:taxonomic_rank]
NCBITaxon:superphylum superphylum superphylum None None [NCBITaxon:taxonomic_rank]
NCBITaxon:taxonomic_rank taxonomic rank taxonomic rank None None []
NCBITaxon:tribe tribe tribe None None [NCBITaxon:taxonomic_rank]
NCBITaxon:varietas varietas varietas None None [NCBITaxon:taxonomic_rank]

2511800 rows × 5 columns

# Lower-case the `name` column (the scientific name column is left as-is),
# then display the row for NCBITaxon:9606.
df["name"] = df["name"].str.lower()
df.loc["NCBITaxon:9606"]
name                          human
scientific_name        Homo sapiens
definition                     None
synonyms                       None
parents            [NCBITaxon:9605]
Name: NCBITaxon:9606, dtype: object
# Write the table to a parquet file next to the source ontology file.
# NOTE(review): absolute, machine-specific path — adjust before running
# elsewhere.
df.to_parquet(
    "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/df_all__ncbitaxon__2023-06-20__Organism.parquet"
)
from bionty.dev._md5 import calculate_md5
# Compute the checksum of the parquet file written above; the notebook
# displays the returned value.
# NOTE(review): presumably this md5 is recorded alongside the file for later
# integrity checks — confirm where it is consumed.
calculate_md5(
    "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/df_all__ncbitaxon__2023-06-20__Organism.parquet"
)
'00d97ba65627f1cd65636d2df22ea76c'