Batch download: Developing mouse brain

Hi,

This is probably going to be a longer answer than you expected. I’ve put together some python code (end of this post) that will, I think, do what you are asking. That being said, there’s a lot of freedom in this system, so I’m going to try to describe my process so that you can tweak the code should it not be giving you exactly what you want.

General resources

The solution, such as it is, depends on the Allen Institute’s “Restful Model Access (RMA)” API. Some generally useful resources when interacting with this API are.

  • This post, which describes the RMA API more generally
  • This post, which attempts to describe how RMA queries are put together
  • This post, which attempts to catalog the kinds of filters and queries the RMA API can support in the most general sense
  • And this post, which links to the data model backing the API

Specific solution

The python code below depends on the libraries requests (which handles passing the query URLs to the API and getting results back) and pandas (mostly for shaping the data and writing it out to a CSV). These can be installed with

pip install requests pandas

The solution performs two queries. First, it queries the model to find all of the NCBI genes that are a part of the “Developing Mouse” product. It then assembles a list of the internal IDs of these genes and uses it to query the StructureUnionize model as described in this post. This code ran for me in about 4 minutes and created as CSV file with ~ 49,000 rows.

import json
import pandas as pd
import requests
import time


def main():

    # file that will be written out
    dst_path = "structure_unionization_result.csv"

    mouse_gene_id_list = get_list_of_mouse_genes()

    print("====START GETTING ACTUAL EXPRESSION DATA====")

    data = []

    # chunk through mouse gene_id_list
    gene_chunk_size = 500
    for i0 in range(0, len(mouse_gene_id_list), gene_chunk_size):
        gene_chunk = mouse_gene_id_list[i0:i0+gene_chunk_size]
        data += get_expression_by_structure(
            gene_id_list=gene_chunk
        )

    pd.DataFrame(data).to_csv(dst_path, index=False)


def get_list_of_mouse_genes():
    """
    Query the API for all of the known NCBI mouse genes.

    Return a sorted list of their 'id' values which can be
    used for querying

    The ID values being returned are IDs internal to the
    Allen Institute API. When we use these IDs to query
    for the actua gene expression, we will convert them
    to Entrez IDs, acronyms, and names.
    """
    t0 = time.time()
    print("====QUERYING API FOR LIST OF ALL KNOWN MOUSE GENES====")
    start_row = 0
    n_total = 0
    chunk_size = 10000

    id_set = set()

    while True:

        query_url = (
            "http://api.brain-map.org/api/v2/data/query.json?criteria="
            "model::NcbiGene,"
            "rma::criteria,organism[name$il'Mus musculus'],"
            "rma::include,organism,products,"
            "rma::options"
            f"[num_rows$eq{chunk_size}]"
            f"[start_row$eq{start_row}]"
        )

        raw_response = requests.get(query_url)

        try:
            response = json.loads(raw_response.text)
            all_genes = response['msg']

        except:
            print("=======RAW RESULT=======")
            print(raw_response.text)
            print("=======")
            raise

        if response['num_rows'] == 0:
            break

        # only keep genes that are associated with
        # 'Developing Mouse' products
        dev_mouse_genes = []
        for gene in all_genes:
            keep_it = False
            for prod in gene['products']:
                if 'Developing Mouse' in prod['name']:
                    keep_it = True
                    break
            if keep_it:
                dev_mouse_genes.append(gene)

        id_set = id_set.union(
            set([int(gene['id']) for gene in dev_mouse_genes])
        )
        start_row += len(all_genes)
        print(f'    FOUND {len(id_set)} GENES SO FAR')

    dur = time.time()-t0
    print(f'FOUND {len(id_set)} GENES IN {dur:.2e} SECONDS')

    return sorted(id_set)


def get_expression_by_structure(
        gene_id_list):
    """
    Query the API for gene expression in structure unionization
    for all genes whose ids are in gene_id_list

    Return a list of dicts characterizing the results.
    """
    t0 = time.time()
    print(f"QUERYING GENE IDs {min(gene_id_list)} through {max(gene_id_list)}")
    chunk_size = 10000
    start_row = 0

    final_result = []
    while True:

        gene_filter = "genes[id$in" + ",".join([str(g) for g in gene_id_list]) + "]"

        query_url = (
            "http://api.brain-map.org/api/v2/data/"
            "query.json?criteria=model::StructureUnionize,"
            "rma::criteria,"
            "section_data_set[delegate$eqfalse] ("
                f"{gene_filter}"
            "),"
            "structure("
                "structure_sets_structures("
                    "structure_set[name$eq'Developing Mouse - Coarse']"
                ")"
            "),"
            "rma::indclude,section_data_set(specimen(donor(age))),"
            f"&start_row={start_row}&num_rows={chunk_size}"
            "&order=ages.embryonic$desc,ages.days,structures.graph_order"
            "&tabular=structure_unionizes.section_data_set_id,"
                "genes.acronym as gene_acronym,"
                "genes.name as gene_name,"
                "genes.entrez_id as gene_entrez_id,"
                "ages.name as age,"
                "ages.embryonic,"
                "ages.days as age_in_days,"
                "donors.strain as donor_strain,"
                "donors.sex as donor_sex,"
                "structures.acronym as structure_acronym,"
                "structures.name as structure_name,"
                "structures.graph_order as structure_graph_order,"
                "structure_unionizes.expression_energy"
        )

        raw_result = requests.get(query_url)
        try:
            result = json.loads(raw_result.text)
            if start_row == 0:
                print(f"    EXPECT TO FIND {result['total_rows']} DATA ENTRIES")
        except:
            print("=======RAW RESULT=======")
            print(raw_result.text)
            print("=======")
            raise

        if not result['success']:
            print(raw_result.text)
        if result['num_rows'] == 0:
            break
        final_result += result['msg']
        start_row += len(result['msg'])
        print(f'    got {len(final_result)}')
    dur = time.time() -t0
    print(f"    FOUND {len(final_result)} DATA ENTRIES IN {dur:.2e} SECONDS")

    return final_result


if __name__ == "__main__":
    main()

1 Like