Organism: ncbitaxon, 2023-06-20
import bionty as bt
# Load the NCBITaxon 2023-06-20 Organism ontology from a local file.
ontology_path = "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism"
onto = bt.Ontology(ontology_path)
onto
Ontology('/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/ontology_all__ncbitaxon__2023-06-20__Organism', timeout=100)
# Look up a single term by its ontology ID and display it as a sanity check.
term = onto["NCBITaxon:9606"]
term
Term('NCBITaxon:9606', name='Homo sapiens')
# Descriptions of the synonyms of `term` whose scope is EXACT.
[syn.description for syn in term.synonyms if syn.scope == "EXACT"]
['human']
# Flatten every ontology term into one row of
# (ontology_id, name, scientific_name, definition, synonyms, parents).
df_values = []
for term in onto.terms():
    # Definition text passed through `.title()`, or None when the term has none.
    # NOTE(review): if the definition behaves like a plain str, `.title()`
    # capitalizes every word -- confirm that this is intended.
    definition = None if term.definition is None else term.definition.title()
    # Descriptions of the EXACT-scope synonyms. Sorted so that the choice of
    # common name below is reproducible: pronto documents `synonyms` as a
    # frozenset, whose iteration order is not stable between interpreter runs.
    exact_synonyms = sorted(
        s.description for s in term.synonyms if s.scope == "EXACT"
    )
    if exact_synonyms:
        # The first EXACT synonym becomes the common name; the rest stay synonyms.
        common_name, *other_synonyms = exact_synonyms
    else:
        # No EXACT synonym: fall back to the term's own name.
        common_name, other_synonyms = term.name, []
    # Pipe-separated remaining synonyms, or None when nothing is left.
    synonyms = "|".join(other_synonyms) or None
    # IDs of the 1st-degree parents, restricted to NCBITaxon terms;
    # sorted so the list order is stable as well.
    superclasses = sorted(
        s.id
        for s in term.superclasses(distance=1, with_self=False).to_set()
        if s.id.startswith("NCBITaxon")
    )
    df_values.append(
        (term.id, common_name, term.name, definition, synonyms, superclasses)
    )
len(df_values)
2511800
import pandas as pd
# Column labels, in the same order as the tuples collected in `df_values`.
column_names = [
    "ontology_id",
    "name",
    "scientific_name",
    "definition",
    "synonyms",
    "parents",
]
# Build the table first, then promote the ontology ID to the index.
df = pd.DataFrame(df_values, columns=column_names)
df = df.set_index("ontology_id")
df
ontology_id | name | scientific_name | definition | synonyms | parents |
---|---|---|---|---|---|
NCBITaxon:1 | root | root | None | None | [] |
NCBITaxon:10 | Cellvibrio | Cellvibrio | None | None | [NCBITaxon:1706371] |
NCBITaxon:100 | Ancylobacter aquaticus | Ancylobacter aquaticus | None | None | [NCBITaxon:99] |
NCBITaxon:100000 | Herbaspirillum sp. BA12 | Herbaspirillum sp. BA12 | None | None | [NCBITaxon:2624150] |
NCBITaxon:1000000 | Microbacterium sp. 6.11-VPa | Microbacterium sp. 6.11-VPa | None | None | [NCBITaxon:2609290] |
... | ... | ... | ... | ... | ... |
NCBITaxon:superorder | superorder | superorder | None | None | [NCBITaxon:taxonomic_rank] |
NCBITaxon:superphylum | superphylum | superphylum | None | None | [NCBITaxon:taxonomic_rank] |
NCBITaxon:taxonomic_rank | taxonomic rank | taxonomic rank | None | None | [] |
NCBITaxon:tribe | tribe | tribe | None | None | [NCBITaxon:taxonomic_rank] |
NCBITaxon:varietas | varietas | varietas | None | None | [NCBITaxon:taxonomic_rank] |
2511800 rows × 5 columns
# Lower-case the `name` column, then display the NCBITaxon:9606 row as a spot check.
df["name"] = df["name"].str.lower()
df.loc["NCBITaxon:9606"]
name human
scientific_name Homo sapiens
definition None
synonyms None
parents [NCBITaxon:9605]
Name: NCBITaxon:9606, dtype: object
# Write the table to parquet, then hash that same file; a single path
# variable is used for both the write and the md5 call.
parquet_path = "/Users/sunnysun/Documents/repos.nosync/bionty/bionty/_dynamic/df_all__ncbitaxon__2023-06-20__Organism.parquet"
df.to_parquet(parquet_path)
from bionty.dev._md5 import calculate_md5
calculate_md5(parquet_path)
'00d97ba65627f1cd65636d2df22ea76c'