CellLine: clo; 2022-03-21
The OWL files for CLO are missing metadata such as definitions and synonyms, so we parse this information manually from the CSV export instead.
Download clo.csv.gz
from: https://data.bioontology.org/ontologies/CLO/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb&download_format=csv
https://bioportal.bioontology.org/ontologies/CLO
import pandas as pd
def _parse_parents(raw, prefix):
    """Return the parent ids in a ``|``-separated IRI string that start with *prefix*."""
    # A missing "Parents" cell is read by pandas as NaN (a float), not a string.
    if not isinstance(raw, str):
        return []
    curies = (
        raw.replace("http://purl.obolibrary.org/obo/", "")
        .replace("_", ":")
        .split("|")
    )
    return [curie for curie in curies if curie.startswith(prefix)]


def df_from_csv(csv_filepath, prefix):
    """Build a table of ontology terms from a BioPortal-style CSV export.

    The OBO PURL base is stripped from ``Class ID`` and underscores are turned
    into colons to form ``ontology_id`` (e.g. ``CLO:0000001``). Only rows whose
    id starts with *prefix* are kept, and each row's parents are reduced to the
    ids that also start with *prefix*.

    Args:
        csv_filepath: Path to the CSV file, passed straight to
            ``pandas.read_csv``.
        prefix: Ontology prefix (e.g. ``"CLO"``) used to filter both the terms
            and their parents.

    Returns:
        A DataFrame indexed by ``ontology_id`` with the columns ``name``,
        ``definition``, ``synonyms`` and ``parents`` (a list of ids per row),
        sorted by id. When several terms share a name, only the one with the
        highest id is kept.
    """
    # low_memory=False makes pandas infer each column dtype from the whole
    # file, which avoids the mixed-type DtypeWarning on wide exports.
    df = pd.read_csv(csv_filepath, low_memory=False)
    # Obsolete terms are not filtered out.

    # regex=False: these are literal substrings; as a regex the "." characters
    # in the URL would match any character, and the default differs between
    # pandas versions.
    df["ontology_id"] = (
        df["Class ID"]
        .str.replace("http://purl.obolibrary.org/obo/", "", regex=False)
        .str.replace("_", ":", regex=False)
    )

    # Filter on the requested prefix (not a hard-coded one) so that the rows
    # and their parents are selected consistently.
    keep = df["ontology_id"].str.startswith(prefix, na=False)
    df = (
        df.loc[keep]
        # Drop any pre-existing "definition" column so that renaming
        # "Definitions" below cannot produce a duplicate column name.
        .drop(columns=["definition"], errors="ignore")
        .rename(
            columns={
                "Preferred Label": "name",
                "Synonyms": "synonyms",
                "Definitions": "definition",
                "Parents": "parents",
            }
        )
    )
    df["parents"] = [_parse_parents(raw, prefix) for raw in df["parents"]]

    df = df[["ontology_id", "name", "definition", "synonyms", "parents"]]
    df = df.sort_values("ontology_id")
    # drop duplicated names, keep the last record
    df = df.drop_duplicates("name", keep="last")
    return df.set_index("ontology_id")
# Parse the downloaded export, keeping only terms whose id starts with "CLO".
df = df_from_csv("clo.csv.gz", "CLO")
/var/folders/m8/s9fnpvhj7qsgng70w8xpts_m0000gn/T/ipykernel_29020/626069511.py:5: DtypeWarning: Columns (8,9,10,12,13,15,17,18,20,23,35,39,40,41,42,43,46,48,49,50,51,53,54,55,56,57,60,63,64,65,70,71,72,77,82,88,89,92,98,99,101,104,105,108,110,111,112,113,115,116,117,118,124,126,127,128,131,132,135,136,137,139,140,143,144,145,149,150,151,152,153,154,158,159,160,161,164,165,166,168,169,170,171,172,173,174,175,178,181,182,184,186,189,190,197,198,199,200,201,202,204,205,206,209,210,211,212,213,215,216,219,220,221,246,257,258,260,261,263,269,270,272,273,274,276,278,284,292,296,297,299,300,303,305,313,316,318,319,322,324,326,327,328,330,333,334,335,336,338,339,340,341,342,343,344,345,346,348,350,352,355,356,359,360,361,362,363,364,365,366,367,368,369,370,372,375,376,377,380,382,383,384,385,387,388,389,390,391,396,397,400,403,404) have mixed types. Specify dtype option on import or set low_memory=False.
df = pd.read_csv(csv_filepath)
/var/folders/m8/s9fnpvhj7qsgng70w8xpts_m0000gn/T/ipykernel_29020/626069511.py:8: FutureWarning: The default value of regex will change from True to False in a future version.
df["Class ID"]
# Display the parsed table (notebook cell output).
df
name | definition | synonyms | parents | |
---|---|---|---|---|
ontology_id | ||||
CLO:0000000 | cell line cell culturing | a maintaining cell culture process that keeps ... | NaN | [] |
CLO:0000001 | cell line cell | A cultured cell that is part of a cell line - ... | NaN | [] |
CLO:0000002 | suspension cell line culturing | suspension cell line culturing is a cell line ... | NaN | [CLO:0000000] |
CLO:0000003 | adherent cell line culturing | adherent cell line culturing is a cell line cu... | NaN | [CLO:0000000] |
CLO:0000004 | cell line cell modification | a material processing that modifies an existin... | NaN | [] |
... | ... | ... | ... | ... |
CLO:0051617 | RCB0187 cell | A immortal medaka cell line cell that has the ... | RCB0187|OLHE-131 | [CLO:0009822] |
CLO:0051618 | RCB2945 cell | A immortal medaka cell line cell that has the ... | RCB2945|DIT29 | [CLO:0009822] |
CLO:0051619 | RCB0184 cell | A immortal medaka cell line cell that has the ... | OLF-136|RCB0184 | [CLO:0009822] |
CLO:0051620 | RCB0188 cell | A immortal medaka cell line cell that has the ... | RCB0188|OLME-104 | [CLO:0009822] |
CLO:0051621 | RCB2319 cell | A immortal cell line cell that has the charact... | LACF-NaNaI|RCB2319 | [CLO:0000019] |
39037 rows × 4 columns
# Spot-check a single record by its ontology id.
df.loc["CLO:0007050"]
name K 562 cell
definition disease: leukemia, chronic myeloid
synonyms K-562|KO|GM05372E|K.562|K562|GM05372|K 562
parents [CLO:0000511]
Name: CLO:0007050, dtype: object
# adding RPE1 and RPE to synonyms as it's used quite often
# Use a single .loc[row, column] access: with chained indexing
# (df.loc[row]["synonyms"] += ...) the update may land on a temporary copy
# and silently leave df unchanged.
df.loc["CLO:0004290", "synonyms"] += "|RPE1|RPE-1|RPE"
df.loc["CLO:0004290", "synonyms"]
'hTERT RPE-1|RPE1|RPE-1|RPE'
# Write the table, including the manual synonym addition, to a parquet file.
df.to_parquet("df_all__clo__2022-03-21__CellLine.parquet")