Plant Gene: ensembl, release-57

arabidopsis thaliana

Prerequisite: install mysqlclient (the MySQL driver used for the database queries below): https://pypi.org/project/mysqlclient/

from bionty.base.entities._gene import EnsemblGene
# Release tag to download; reused later when naming the output parquet file.
version = "release-57"

# Gene source configured for A. thaliana with taxa="plants".
ensembl_gene = EnsemblGene(
    organism="arabidopsis thaliana",
    taxa="plants",
    version=version,
)

# Pull the gene table; progress and warnings are logged while fetching.
df = ensembl_gene.download_df()
• fetching records from the core DB...
• fetching records from the external DBs...
! duplicated #rows ensembl_gene_id with ncbi_gene_id: 438
! no ensembl_gene_id found, writing to table_id column.
✓ downloaded Gene table containing 75285 entries.
# Preview the downloaded table; in the output shown, a gene's stable_id
# repeats once per synonym.
df
stable_id symbol ncbi_gene_id biotype description synonyms index
0 AT1G01010 NAC001 NaN protein_coding NAC domain containing protein 1 [Source:NCBI g... T25K16_1 43080
1 AT1G01010 NAC001 NaN protein_coding NAC domain containing protein 1 [Source:NCBI g... T25K16.1 43079
2 AT1G01010 NAC001 NaN protein_coding NAC domain containing protein 1 [Source:NCBI g... NAC domain containing protein 1 43078
3 AT1G01010 NAC001 NaN protein_coding NAC domain containing protein 1 [Source:NCBI g... ANAC001 43077
4 AT1G01020 ARV1 NaN protein_coding ARV1 family protein [Source:NCBI gene (formerl... T25K16_2 46552
... ... ... ... ... ... ... ...
75280 ATMG09730 None NaN tRNA None None 1533
75281 ATMG09740 None NaN tRNA None None 1390
75282 ATMG09950 None NaN tRNA None None 435
75283 ATMG09960 None NaN tRNA None None 1466
75284 ATMG09980 None NaN tRNA None None 1420

75285 rows × 7 columns

# https://github.com/laminlabs/bionty-base/issues/533
# Drop the bracketed "[Source:...]" annotation from each description, then
# strip surrounding whitespace: the annotation is preceded by a space, so
# removing it alone would leave descriptions ending in a trailing blank.
df["description"] = (
    df["description"]
    .str.replace(r"\[.*?\]", "", regex=True)
    .str.strip()
)

# Persist the cleaned table; the file name encodes organism, source, release
# and entity.
df.to_parquet(f"df_arabidopsis thaliana__ensembl__{version}__Gene.parquet")

# Look up legacy IDs keyed on the stable_id column and show the table's shape.
df_legacy = ensembl_gene.download_legacy_ids_df(df, col="stable_id")
df_legacy.shape
(0, 0)