Ensembl REST connection

import pandas as pd


# Column names of the species table. update_species_table() reads every name
# except "scientific_name" from the "@<name>" attribute of each species entry;
# "scientific_name" itself is taken from the entry's "@name" attribute.
SPECIES_COLS = [
    "scientific_name",
    "display_name",
    "common_name",
    "taxon_id",
    "assembly",
    "accession",
    "release",
]


import xmltodict

# Base URL of the Ensembl REST server; the endpoint paths below are appended
# to it when a request is built.
ENSEMBL_REST = "http://rest.ensembl.org"
# Endpoint paths, exposed as attributes (e.g. ENSEMBL_REST_EXT.LOOKUP_IDS).
# NOTE(review): `Namespace` is not imported in the visible part of this file;
# confirm where it comes from.
ENSEMBL_REST_EXT = Namespace(
    **{
        "SPECIES_INFO": "/info/species?",  # all species info
        "ARCHIVE_IDS": "/archive/id",  # retrieves the latest version of ids
        "XREFS_ID": "/xrefs/id/",  # external references of an id (EnsemblREST.xref)
        "LOOKUP_IDS": "/lookup/id",  # used by EnsemblREST.lookup_ids
        "LOOKUP_SYMBOLS": "/lookup/symbol/",  # species is appended (EnsemblREST.lookup_symbols)
        "SEQ_IDS": "/sequence/id",  # used by EnsemblREST.seq_ids
    }
)


import asyncio
import sys
from typing import Any

import httpx
import nest_asyncio
import requests  # type: ignore
from syncer import sync

# Module-level side effect: applied once at import time, before any of the
# async helpers below are used. Presumably needed because IPython/Jupyter
# already runs an event loop -- TODO confirm.
nest_asyncio.apply()  # Fixes the issue with iPython compatibility


async def _async_get(client, url, content_type="application/json"):
    """GET ``url`` with ``client`` and decode the response body.

    Args:
        client: Object exposing an awaitable ``get(url, headers=...)``; in
            this module an ``httpx.AsyncClient``.
        url: Full URL to request.
        content_type: Value sent in the request's ``content-type`` header.

    Returns:
        The parsed JSON body when the response declares a JSON content type,
        otherwise the response body as text.
    """
    resp = await client.get(url, headers={"content-type": content_type})

    # A response without a content-type header is treated as text instead of
    # failing with a KeyError.
    if "application/json" in resp.headers.get("content-type", ""):
        return resp.json()
    return resp.text


@sync
async def get_request_async(
    server_request: str, terms, content_type="application/json"
):
    """Request ``server_request + term`` for every term concurrently.

    All requests share one ``httpx.AsyncClient``. The coroutine is wrapped
    with ``sync``; ``EnsemblREST.xref`` calls it like a regular function.

    Args:
        server_request: URL prefix to which each term is appended.
        terms: Iterable of terms, one request per term.
        content_type: Forwarded to ``_async_get`` for every request.

    Returns:
        The decoded responses as produced by ``asyncio.gather``, one per term.
    """
    async with httpx.AsyncClient() as client:
        # Schedule every request up front so they run concurrently.
        pending = [
            asyncio.ensure_future(
                _async_get(
                    client, f"{server_request}{term}", content_type=content_type
                )
            )
            for term in terms
        ]
        results = await asyncio.gather(*pending)

    return results


def get_request(server, request, content_type="application/json", **kwds) -> Any:
    """Fetch an endpoint from the server.

    Allow overriding of default content-type

    Args:
        server: Base URL of the server.
        request: Endpoint path (and query string) appended to ``server``.
        content_type: Sent as the ``Accept`` header of the request.
        **kwds: Passed through to ``requests.get``.

    Returns:
        The decoded JSON body when ``content_type`` is ``"application/json"``,
        otherwise the response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.get(server + request, headers={"Accept": content_type}, **kwds)

    # raise_for_status() does nothing for a successful response and raises for
    # an error status, so no separate `r.ok` check or process exit is needed.
    r.raise_for_status()

    if content_type == "application/json":
        return r.json()
    return r.text


def post_request(server, request, data, content_type="application/json", **kwds) -> Any:
    """POST ``data`` to an endpoint of the server.

    Args:
        server: Base URL of the server.
        request: Endpoint path appended to ``server``.
        data: Request body, passed to ``requests.post`` unchanged.
        content_type: Sent as the ``Content-Type`` header of the request.
        **kwds: Passed through to ``requests.post``.

    Returns:
        The decoded JSON body when ``content_type`` is ``"application/json"``,
        otherwise the response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    r = requests.post(
        server + request, headers={"Content-Type": content_type}, data=data, **kwds
    )

    # raise_for_status() does nothing for a successful response and raises for
    # an error status, so no separate `r.ok` check or process exit is needed.
    r.raise_for_status()

    if content_type == "application/json":
        return r.json()
    return r.text


class EnsemblREST:
    """Queries via the Ensembl REST APIs."""

    def __init__(self) -> None:
        self._server = ENSEMBL_REST

    @property
    def server(self):
        """Base URL of the REST server (``ENSEMBL_REST``)."""
        return self._server

    def _config_data(self, ids, label):
        """Build the JSON body of a POST request.

        Args:
            ids: A single identifier or an iterable of identifiers.
            label: Key under which the identifiers are sent.

        Returns:
            A JSON object string of the form ``{"<label>": [<ids>...]}``.
        """
        # Imported locally so this method does not depend on a module-level
        # import being present.
        import json

        # A lone string is wrapped rather than split into characters by list().
        values = [ids] if isinstance(ids, str) else list(ids)
        # json.dumps handles quoting/escaping, unlike swapping quotes in str().
        return json.dumps({label: values})

    def species_info(self, return_raw=False):
        """Fetch ``ENSEMBL_REST_EXT.SPECIES_INFO`` as XML.

        Args:
            return_raw: If true, return the XML text unparsed.

        Returns:
            The raw XML text, or the ``opt/data/species`` part of the
            document parsed with ``xmltodict``.
        """
        ext = ENSEMBL_REST_EXT.SPECIES_INFO
        res = get_request(self.server, ext, "text/xml")
        if return_raw:
            return res
        return xmltodict.parse(res)["opt"]["data"]["species"]

    def xref(self, ids, **kwargs):
        """Retrieve external references of Ensembl ids.

        See https://rest.ensembl.org/documentation/info/xref_id

        Args:
            ids: A single id (``str``) or an iterable of ids.
            **kwargs: Forwarded to ``get_request`` for a single id only; the
                multi-id path does not use them.

        Returns:
            The response for a single id, or one response per id.
        """
        if isinstance(ids, str):
            ext = f"{ENSEMBL_REST_EXT.XREFS_ID}{ids}?"
            res = get_request(self.server, ext, **kwargs)
        else:
            # Several ids: one GET per id, issued concurrently.
            res = get_request_async(self.server + ENSEMBL_REST_EXT.XREFS_ID, ids)
        return res

    def archive_ids(self, ids):
        """Retrieve the latest version for a set of identifiers."""
        ext = ENSEMBL_REST_EXT.ARCHIVE_IDS
        res = post_request(self.server, ext, data=self._config_data(ids, "id"))
        return res

    def lookup_ids(self, ids, **kwargs):
        """Find the species and database for several identifiers.

        See https://rest.ensembl.org/documentation/info/lookup_post
        """
        ext = ENSEMBL_REST_EXT.LOOKUP_IDS
        # The POST lookup/id message body uses the key "ids".
        res = post_request(
            self.server, ext, data=self._config_data(ids, "ids"), **kwargs
        )
        return res

    def lookup_symbols(self, symbols, species="homo_sapiens", **kwargs):
        """Find the species and database for symbols in a linked external database."""
        ext = f"{ENSEMBL_REST_EXT.LOOKUP_SYMBOLS}{species}"
        # The POST lookup/symbol message body uses the key "symbols".
        res = post_request(
            self.server, ext, data=self._config_data(symbols, "symbols"), **kwargs
        )
        return res

    def seq_ids(self, ids, **kwargs):
        """Request multiple types of sequence by a stable identifier list."""
        ext = ENSEMBL_REST_EXT.SEQ_IDS
        # The POST sequence/id message body uses the key "ids".
        res = post_request(
            self.server, ext, data=self._config_data(ids, "ids"), **kwargs
        )
        return res


def update_species_table() -> None:
    """Fetch the species table from Ensembl REST and write it to a CSV file.

    One row is built per species entry returned by
    ``EnsemblREST().species_info()``. The table gets a ``short_name`` column,
    is indexed by the lower-cased ``display_name`` and is written to
    ``SPECIES_FILENAME``. Nothing is returned.
    """
    entries = EnsemblREST().species_info()

    # Every column except the scientific name is read from the entry's
    # "@<column>" attribute; the scientific name is the entry's "@name".
    cols = [col for col in SPECIES_COLS if col != "scientific_name"]
    rows: dict = {
        entry.get("@name"): [entry.get(f"@{col}") for col in cols]
        for entry in entries
    }

    # from_dict() puts each species in its own column, hence the transpose.
    sp_df = pd.DataFrame.from_dict(rows).T
    sp_df.columns = cols
    sp_df.index.name = "scientific_name"
    sp_df["display_name"] = sp_df["display_name"].str.lower()
    # Adding a short_name column: first letter of the scientific name followed
    # by its last "_"-separated part
    sp_df["short_name"] = [
        f'{name[0].lower()}{name.split("_")[-1]}' for name in sp_df.index
    ]
    # Set display_name as the index for std_id
    sp_df = sp_df.reset_index().set_index("display_name")
    sp_df.to_csv(SPECIES_FILENAME, header=True, index=True)

---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
Input In [2], in <cell line: 17>()
      4 SPECIES_COLS = [
      5     "scientific_name",
      6     "display_name",
   (...)
     11     "release",
     12 ]
     15 import xmltodict
---> 17 from ._httpx import get_request, get_request_async, post_request
     18 from ._urls import ENSEMBL_REST, ENSEMBL_REST_EXT
     21 class EnsemblREST:

ImportError: attempted relative import with no known parent package