import pandas as pd
# Field names of one species record. ``update_species_table`` reads each of
# these (except "scientific_name", which it uses as the row key) from the
# matching "@<field>" attribute of an Ensembl species entry.
SPECIES_COLS = [
    "scientific_name",
    "display_name",
    "common_name",
    "taxon_id",
    "assembly",
    "accession",
    "release",
]
import xmltodict
# Base URL that every endpoint path below is appended to.
ENSEMBL_REST = "http://rest.ensembl.org"

# Endpoint paths, exposed as attributes (e.g. ``ENSEMBL_REST_EXT.LOOKUP_IDS``).
ENSEMBL_REST_EXT = Namespace(
    SPECIES_INFO="/info/species?",  # all species info
    ARCHIVE_IDS="/archive/id",  # retrieves the latest version of ids
    XREFS_ID="/xrefs/id/",
    LOOKUP_IDS="/lookup/id",
    LOOKUP_SYMBOLS="/lookup/symbol/",
    SEQ_IDS="/sequence/id",
)
import asyncio
import sys
from typing import Any
import httpx
import nest_asyncio
import requests # type: ignore
from syncer import sync
# Applied once at import time. Fixes the issue with IPython compatibility
# (presumably because IPython/Jupyter already runs an event loop, which the
# synchronous wrapper around ``get_request_async`` would otherwise collide
# with -- confirm against the nest_asyncio documentation).
nest_asyncio.apply()
async def _async_get(client, url, content_type="application/json"):
    """Issue one GET request through ``client`` and decode the response body.

    Args:
        client: Object with an awaitable ``get(url, headers=...)`` method;
            ``get_request_async`` passes an ``httpx.AsyncClient``.
        url: Full URL to fetch.
        content_type: Value sent in the ``content-type`` request header.

    Returns:
        ``resp.json()`` when the response's content-type header mentions
        ``application/json``; otherwise ``resp.text``.
    """
    resp = await client.get(url, headers={"content-type": content_type})
    # Decide on the *response* header, not the requested type. A response
    # that carries no content-type header is returned as text instead of
    # raising KeyError.
    if "application/json" in resp.headers.get("content-type", ""):
        return resp.json()
    return resp.text
@sync
async def get_request_async(
    server_request: str, terms, content_type="application/json"
):
    """Fetch ``server_request + term`` for every term concurrently.

    All requests share a single ``httpx.AsyncClient``, which is closed once
    every response has been collected. Because of the ``sync`` decorator the
    function is called without ``await`` (see ``EnsemblREST.xref``).

    Args:
        server_request: URL prefix that each term is appended to.
        terms: Iterable of values formatted onto the end of the prefix.
        content_type: Forwarded to ``_async_get`` for each request.

    Returns:
        The decoded responses gathered with ``asyncio.gather``, one per term.
    """
    async with httpx.AsyncClient() as client:
        pending = [
            asyncio.ensure_future(
                _async_get(
                    client, f"{server_request}{term}", content_type=content_type
                )
            )
            for term in terms
        ]
        return await asyncio.gather(*pending)
def get_request(server, request, content_type="application/json", **kwds) -> Any:
    """Fetch an endpoint from the server with a GET request.

    Allow overriding of default content-type.

    Args:
        server: Base URL of the server.
        request: Endpoint path (and query string) appended to ``server``.
        content_type: Sent as the ``Accept`` header. When it is exactly
            ``"application/json"`` the response is parsed as JSON.
        **kwds: Forwarded unchanged to ``requests.get``.

    Returns:
        The parsed JSON body for ``"application/json"``, otherwise the
        response text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(server + request, headers={"Accept": content_type}, **kwds)
    # raise_for_status() raises exactly when ``r.ok`` is false and does
    # nothing otherwise, so no separate guard (or process exit) is needed.
    r.raise_for_status()
    if content_type == "application/json":
        return r.json()
    return r.text
def post_request(server, request, data, content_type="application/json", **kwds) -> Any:
    """Send a POST request to an endpoint of the server.

    Args:
        server: Base URL of the server.
        request: Endpoint path appended to ``server``.
        data: Request body, passed to ``requests.post`` as ``data``.
        content_type: Sent as the ``Content-Type`` header. When it is exactly
            ``"application/json"`` the response is parsed as JSON.
        **kwds: Forwarded unchanged to ``requests.post``.

    Returns:
        The parsed JSON body for ``"application/json"``, otherwise the
        response text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.post(
        server + request, headers={"Content-Type": content_type}, data=data, **kwds
    )
    # raise_for_status() raises exactly when ``r.ok`` is false and does
    # nothing otherwise, so no separate guard (or process exit) is needed.
    r.raise_for_status()
    if content_type == "application/json":
        return r.json()
    return r.text
class EnsemblREST:
    """Queries via the Ensembl REST APIs."""

    def __init__(self) -> None:
        self._server = ENSEMBL_REST

    @property
    def server(self):
        """Base URL of the REST server (``ENSEMBL_REST``)."""
        return self._server

    def _config_data(self, ids, label):
        """Serialize identifiers into the JSON body of a POST request.

        Args:
            ids: One identifier as a string, or an iterable of identifiers.
            label: JSON key the identifier array is stored under.

        Returns:
            A JSON object string of the form ``{"<label>": [...]}``.
        """
        import json  # local import keeps this class self-contained

        # A bare string would otherwise be split into single characters.
        if isinstance(ids, str):
            ids = [ids]
        # json.dumps handles quoting/escaping for any sequence type, unlike
        # rewriting the repr of the Python object.
        return json.dumps({label: list(ids)})

    def species_info(self, return_raw=False):
        """Fetch ``ENSEMBL_REST_EXT.SPECIES_INFO`` as XML.

        Args:
            return_raw: If true, return the XML text as received.

        Returns:
            The raw XML text, or the ``opt -> data -> species`` element of
            the document parsed with ``xmltodict``.
        """
        ext = ENSEMBL_REST_EXT.SPECIES_INFO
        res = get_request(self.server, ext, "text/xml")
        if return_raw:
            return res
        return xmltodict.parse(res)["opt"]["data"]["species"]

    def xref(self, ids, **kwargs):
        """Retrieve external references of Ensembl ids.

        See https://rest.ensembl.org/documentation/info/xref_id

        Args:
            ids: A single id string, or an iterable of ids.
            **kwargs: Forwarded to ``get_request`` for a single id only; the
                multi-id path does not use them.

        Returns:
            The response for a single id, or a list with one response per id.
        """
        if isinstance(ids, str):
            ext = f"{ENSEMBL_REST_EXT.XREFS_ID}{ids}?"
            return get_request(self.server, ext, **kwargs)
        # Several ids: one GET per id, issued concurrently.
        return get_request_async(self.server + ENSEMBL_REST_EXT.XREFS_ID, ids)

    def archive_ids(self, ids):
        """Retrieve the latest version for a set of identifiers."""
        ext = ENSEMBL_REST_EXT.ARCHIVE_IDS
        return post_request(self.server, ext, data=self._config_data(ids, "id"))

    def lookup_ids(self, ids, **kwargs):
        """Find the species and database for several identifiers.

        See https://rest.ensembl.org/documentation/info/lookup_post
        """
        ext = ENSEMBL_REST_EXT.LOOKUP_IDS
        # The lookup POST endpoint expects its id array under "ids"
        # (the archive endpoint is the one that uses "id").
        return post_request(
            self.server, ext, data=self._config_data(ids, "ids"), **kwargs
        )

    def lookup_symbols(self, symbols, species="homo_sapiens", **kwargs):
        """Find the species and database for symbols in a linked external database.

        Args:
            symbols: Symbols to look up.
            species: Species name appended to the endpoint path.
            **kwargs: Forwarded to ``post_request``.
        """
        ext = f"{ENSEMBL_REST_EXT.LOOKUP_SYMBOLS}{species}"
        return post_request(
            self.server, ext, data=self._config_data(symbols, "symbols"), **kwargs
        )

    def seq_ids(self, ids, **kwargs):
        """Request multiple types of sequence by a stable identifier list."""
        ext = ENSEMBL_REST_EXT.SEQ_IDS
        return post_request(
            self.server, ext, data=self._config_data(ids, "ids"), **kwargs
        )
def update_species_table() -> None:
    """Fetch the species table from Ensembl REST and write it to a CSV file.

    Each species entry becomes one row holding the ``SPECIES_COLS`` fields
    plus a derived ``short_name`` (first letter of the scientific name
    followed by its last ``_``-separated part, e.g. ``homo_sapiens`` ->
    ``hsapiens``). The table is indexed by the lower-cased ``display_name``
    and saved to ``SPECIES_FILENAME``.

    Returns:
        None. The result is written to disk, not returned.
    """
    entries = EnsemblREST().species_info()

    # Every field except the scientific name, which serves as the row key.
    cols = [col for col in SPECIES_COLS if col != "scientific_name"]
    # Entry attributes are exposed with an "@" prefix.
    rows = {
        entry.get("@name"): [entry.get(f"@{col}") for col in cols]
        for entry in entries
    }

    # format into a dataframe: one row per species, one column per field
    sp_df = pd.DataFrame.from_dict(rows, orient="index", columns=cols)
    sp_df.index.name = "scientific_name"
    sp_df["display_name"] = sp_df["display_name"].str.lower()
    # Adding a short_name column
    sp_df["short_name"] = [
        f'{name[0].lower()}{name.split("_")[-1]}' for name in sp_df.index
    ]
    # Set display_name as the index for std_id
    sp_df = sp_df.reset_index().set_index("display_name")
    sp_df.to_csv(SPECIES_FILENAME, header=True, index=True)