Module: ncbi.datasets.package.dataset
Python API: ncbi.datasets.package.dataset
For example, the following script downloads a gene data package, lists the files it contains, and then streams through its data reports:
import sys
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.package import dataset
# Example script: download a gene data package as a zip archive, then read
# its catalog, its protein FASTA files and its per-gene data reports.

# The list of gene-ids to be downloaded.
gene_ids: List[int] = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17]
# Local path the downloaded zip archive is written to and later re-opened from.
zipfile_name = "gene_ds.zip"

# Download the data package through the api-client.
with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        # NOTE(review): `_preload_content=False` presumably makes the client
        # hand back the raw HTTP response, whose bytes are read via `.data`
        # below -- confirm against the generated client's documentation.
        gene_dataset_download = gene_api.download_gene_package(
            gene_ids,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        # Persist the response body so the package can be opened from disk.
        with open(zipfile_name, "wb") as f:
            f.write(gene_dataset_download.data)
    except DatasetsApiException as e:
        # Abort the script with the API error as the exit message.
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# Open the package zip archive so we can retrieve files from it.
package = dataset.GeneDataset(zipfile_name)

# Print the names and types of all files in the downloaded zip file.
print(package.get_catalog())

# Use file types or names from the catalog to retrieve contents from
# specific files, e.g. protein fasta. Only the first 100 characters of each
# file's contents are printed.
for protein_fasta, file_name in package.get_files_by_type("PROTEIN_FASTA"):
    print(file_name, protein_fasta[:100])

# Get the data report and print the id and symbol for each downloaded gene.
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")
Python API: ncbi.datasets.package.dataset