Exploring the Ensembl FTP
from collections import namedtuple
import pandas as pd
from bionty import Species
from nbproject import header
header()
id | 4rPX1NdzSZVS |
version | draft |
time_init | 2022-09-26 09:32 |
time_run | 2022-09-26 09:32 |
pypackage | nbproject==0.5.3 pandas==1.4.3 |
# Common name of the species whose Ensembl TSV dumps are explored below.
common_name = "human"
# Row of the bionty species table for that common name; its
# `scientific_name` and `assembly` attributes are read further down.
species = Species().df.loc[common_name]
# Directory with the current TSV dumps for this species (ends with "/").
# Fixed: the original referenced an undefined name `human` here.
baseurl = f"http://ftp.ensembl.org/pub/current_tsv/{species.scientific_name}/"
checksums = baseurl + "CHECKSUMS"
# Take the release number from the CHECKSUMS listing: row 1, column 2,
# third dot-separated field. Presumably that cell is a file name such as
# "Homo_sapiens.GRCh38.107...." -- TODO confirm the layout is stable.
version = pd.read_fwf(checksums, header=None).iloc[1][2].split(".")[2]
version
'107'
# Common prefix of every dump file name: "<Species>.<assembly>.<release>."
file_prefix = "{}.{}.{}.".format(
    species.scientific_name.capitalize(),
    species.assembly,
    version,
)
file_prefix
'Homo_sapiens.GRCh38.107.'
# File-name suffixes (appended to `file_prefix`) of the TSV dumps, keyed by
# a short table name. `karyotype` is included so that the mapping matches
# the recorded cell output, which lists it.
filenames = dict(
    canonical="canonical.tsv",
    ena="ena.tsv.gz",
    entrez="entrez.tsv.gz",
    karyotype="karyotype.tsv.gz",
    refseq="refseq.tsv.gz",
    uniprot="uniprot.tsv.gz",
)
filenames
{'canonical': 'canonical.tsv',
'ena': 'ena.tsv.gz',
'entrez': 'entrez.tsv.gz',
'karyotype': 'karyotype.tsv.gz',
'refseq': 'refseq.tsv.gz',
'uniprot': 'uniprot.tsv.gz'}
# Load the canonical-transcript table. It is read with header=None, so the
# columns are labelled 0, 1, 2.
# `baseurl` already ends with "/", so no extra separator is inserted
# (the original produced a "//" in the URL path).
canonical = pd.read_table(
    f"{baseurl}{file_prefix}{filenames['canonical']}", header=None
)
canonical
0 | 1 | 2 | |
---|---|---|---|
0 | ENSG00000210049.1 | ENST00000387314.1 | Ensembl Canonical |
1 | ENSG00000211459.2 | ENST00000389680.2 | Ensembl Canonical |
2 | ENSG00000210077.1 | ENST00000387342.1 | Ensembl Canonical |
3 | ENSG00000210082.2 | ENST00000387347.2 | Ensembl Canonical |
4 | ENSG00000209082.1 | ENST00000386347.1 | Ensembl Canonical |
... | ... | ... | ... |
87439 | ENSG00000168509.20 | ENST00000336751.11 | MANE Select |
87440 | ENSG00000196859.8 | ENST00000355612.7 | MANE Select |
87441 | ENSG00000250479.9 | ENST00000484558.3 | MANE Select |
87442 | ENSG00000164488.12 | ENST00000366795.4 | MANE Select |
87443 | ENSG00000187533.14 | ENST00000344526.10 | MANE Select |
87444 rows × 3 columns
# Split the values of column 0 on "." into separate columns; column 0 of
# the result is the part before the first dot.
geneids = canonical.loc[:, 0].str.split(pat=".", expand=True)
# Does any id (without its suffix) occur more than once?
geneids[0].is_unique
False
# Number of distinct ids in column 0 (a missing value counts as one).
geneids[0].nunique(dropna=False)
68324
# The unique gene_stable_id values in the ENA table match the canonical
# table exactly (same count of unique ids, see the cells below/above).
# `baseurl` already ends with "/", so no extra separator is inserted
# (the original produced a "//" in the URL path).
ena = pd.read_table(f"{baseurl}{file_prefix}{filenames['ena']}")
ena
species | taxid | gene_stable_id | transcript_stable_id | protein_stable_id | primary_accession | secondary_accession | |
---|---|---|---|---|---|---|---|
0 | Homo_sapiens | 9606 | ENSG00000000003 | ENST00000373020 | ENSP00000362111 | CM000685 | AAH12389 |
1 | Homo_sapiens | 9606 | ENSG00000000003 | ENST00000373020 | ENSP00000362111 | chrX | AAH12389 |
2 | Homo_sapiens | 9606 | ENSG00000000003 | ENST00000373020 | ENSP00000362111 | NC_000023 | AAH12389 |
3 | Homo_sapiens | 9606 | ENSG00000000003 | ENST00000612152 | ENSP00000482130 | CM000685 | NaN |
4 | Homo_sapiens | 9606 | ENSG00000000003 | ENST00000612152 | ENSP00000482130 | chrX | NaN |
... | ... | ... | ... | ... | ... | ... | ... |
775080 | Homo_sapiens | 9606 | ENSG00000290165 | ENST00000703415 | NaN | chrX | NaN |
775081 | Homo_sapiens | 9606 | ENSG00000290165 | ENST00000703415 | NaN | NC_000023 | NaN |
775082 | Homo_sapiens | 9606 | ENSG00000290166 | ENST00000702095 | NaN | CM000681 | NaN |
775083 | Homo_sapiens | 9606 | ENSG00000290166 | ENST00000702095 | NaN | chr19 | NaN |
775084 | Homo_sapiens | 9606 | ENSG00000290166 | ENST00000702095 | NaN | NC_000019 | NaN |
775085 rows × 7 columns
# Count the distinct gene_stable_id values in the ENA table. The original
# cell only re-displayed `ena`, which does not produce the integer that is
# recorded as this cell's output.
len(ena["gene_stable_id"].unique())
68324
# Only contains Ensembl ids that have a mappable Entrez id.
# `baseurl` already ends with "/", so no extra separator is inserted
# (the original produced a "//" in the URL path).
entrez = pd.read_table(f"{baseurl}{file_prefix}{filenames['entrez']}")
entrez
gene_stable_id | transcript_stable_id | protein_stable_id | xref | db_name | info_type | source_identity | xref_identity | linkage_type | |
---|---|---|---|---|---|---|---|---|---|
0 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | 83858 | EntrezGene | DEPENDENT | - | - | - |
1 | ENSG00000160072 | ENST00000472194 | - | 83858 | EntrezGene | DEPENDENT | - | - | - |
2 | ENSG00000160072 | ENST00000378736 | - | 83858 | EntrezGene | DEPENDENT | - | - | - |
3 | ENSG00000160072 | ENST00000485748 | - | 83858 | EntrezGene | DEPENDENT | - | - | - |
4 | ENSG00000160072 | ENST00000474481 | - | 83858 | EntrezGene | DEPENDENT | - | - | - |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
223255 | ENSG00000212907 | ENST00000361335 | ENSP00000354728 | 4539 | EntrezGene | DEPENDENT | - | - | - |
223256 | ENSG00000198886 | ENST00000361381 | ENSP00000354961 | 4538 | EntrezGene | DEPENDENT | - | - | - |
223257 | ENSG00000198786 | ENST00000361567 | ENSP00000354813 | 4540 | EntrezGene | DEPENDENT | - | - | - |
223258 | ENSG00000198695 | ENST00000361681 | ENSP00000354665 | 4541 | EntrezGene | DEPENDENT | - | - | - |
223259 | ENSG00000198727 | ENST00000361789 | ENSP00000354554 | 4519 | EntrezGene | DEPENDENT | - | - | - |
223260 rows × 9 columns
# Number of distinct gene_stable_id values in the Entrez table
# (a missing value counts as one).
entrez["gene_stable_id"].nunique(dropna=False)
28975
# Load the RefSeq cross-reference table.
# `baseurl` already ends with "/", so no extra separator is inserted
# (the original produced a "//" in the URL path).
refseq = pd.read_table(f"{baseurl}{file_prefix}{filenames['refseq']}")
refseq
gene_stable_id | transcript_stable_id | protein_stable_id | xref | db_name | info_type | source_identity | xref_identity | linkage_type | |
---|---|---|---|---|---|---|---|---|---|
0 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | NP_001304167 | RefSeq_peptide | INFERRED_PAIR | - | - | - |
1 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | NP_114127 | RefSeq_peptide | DIRECT | 100 | 100 | - |
2 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | NM_001317238 | RefSeq_mRNA | DIRECT | 90 | 82 | - |
3 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | NM_031921 | RefSeq_mRNA | DIRECT | 100 | 100 | - |
4 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | XM_005244806 | RefSeq_mRNA_predicted | DIRECT | 45 | 92 | - |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
263548 | ENSG00000212907 | ENST00000361335 | ENSP00000354728 | YP_003024034 | RefSeq_peptide | SEQUENCE_MATCH | 100 | 100 | - |
263549 | ENSG00000198886 | ENST00000361381 | ENSP00000354961 | YP_003024035 | RefSeq_peptide | SEQUENCE_MATCH | 100 | 100 | - |
263550 | ENSG00000198786 | ENST00000361567 | ENSP00000354813 | YP_003024036 | RefSeq_peptide | SEQUENCE_MATCH | 100 | 100 | - |
263551 | ENSG00000198695 | ENST00000361681 | ENSP00000354665 | YP_003024037 | RefSeq_peptide | SEQUENCE_MATCH | 100 | 100 | - |
263552 | ENSG00000198727 | ENST00000361789 | ENSP00000354554 | YP_003024038 | RefSeq_peptide | SEQUENCE_MATCH | 100 | 100 | - |
263553 rows × 9 columns
# Load the UniProt cross-reference table.
# `baseurl` already ends with "/", so no extra separator is inserted
# (the original produced a "//" in the URL path).
uniprot = pd.read_table(f"{baseurl}{file_prefix}{filenames['uniprot']}")
uniprot
gene_stable_id | transcript_stable_id | protein_stable_id | xref | db_name | info_type | source_identity | xref_identity | linkage_type | |
---|---|---|---|---|---|---|---|---|---|
0 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | Q5T9A4 | Uniprot/SWISSPROT | DIRECT | 100 | 100 | - |
1 | ENSG00000160072 | ENST00000673477 | ENSP00000500094 | Q5T9A4-1 | Uniprot_isoform | DIRECT | - | - | - |
2 | ENSG00000160072 | ENST00000308647 | ENSP00000311766 | A0A5K1VW56 | Uniprot/SPTREMBL | DIRECT | 100 | 100 | - |
3 | ENSG00000142611 | ENST00000511072 | ENSP00000426975 | D6RDW0 | Uniprot/SPTREMBL | DIRECT | 100 | 100 | - |
4 | ENSG00000142611 | ENST00000378391 | ENSP00000367643 | Q9HAZ2 | Uniprot/SWISSPROT | DIRECT | - | - | - |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
158326 | ENSG00000198695 | ENST00000361681 | ENSP00000354665 | P03923 | Uniprot/SWISSPROT | DIRECT | 100 | 100 | - |
158327 | ENSG00000198695 | ENST00000361681 | ENSP00000354665 | A0A1X7RCR6 | Uniprot/SPTREMBL | SEQUENCE_MATCH | 100 | 100 | - |
158328 | ENSG00000198695 | ENST00000361681 | ENSP00000354665 | U5Z977 | Uniprot/SPTREMBL | SEQUENCE_MATCH | 100 | 100 | - |
158329 | ENSG00000198727 | ENST00000361789 | ENSP00000354554 | P00156 | Uniprot/SWISSPROT | DIRECT | 100 | 100 | - |
158330 | ENSG00000198727 | ENST00000361789 | ENSP00000354554 | Q0ZFD6 | Uniprot/SPTREMBL | SEQUENCE_MATCH | 100 | 100 | - |
158331 rows × 9 columns