Package: ncbi.datasets.package

Utilities to work with NCBI Datasets data packages

For example, the following script downloads a gene data package, lists the files it contains, and then streams through its data reports:

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi

from ncbi.datasets.package import dataset


# gene-ids whose data package will be requested (1-3 and 9-17)
gene_ids: List[int] = [1, 2, 3] + list(range(9, 18))

# local path the downloaded zip archive is written to
zipfile_name = "gene_ds.zip"

# request the data package via the api-client and save the response to disk;
# an API error terminates the script with a message
with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        package_response = gene_api.download_gene_package(
            gene_ids,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        with open(zipfile_name, "wb") as zip_out:
            zip_out.write(package_response.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# wrap the saved zip archive so individual files can be pulled out of it
package = dataset.GeneDataset(zipfile_name)

# show the names and types of every file in the downloaded archive
catalog = package.get_catalog()
print(catalog)

# file types (or names) listed in the catalog select which files to read;
# here: print the first 100 characters of each protein fasta file
for fasta_contents, fasta_name in package.get_files_by_type("PROTEIN_FASTA"):
    preview = fasta_contents[:100]
    print(fasta_name, preview)

# walk the data reports, printing gene id and symbol (tab-separated) per gene
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")

Module: ncbi.datasets.package.dataset

Python API: ncbi.datasets.package.dataset

Generated November 25, 2024