Hi,
This is probably going to be a longer answer than you expected. I’ve put together some python code (end of this post) that will, I think, do what you are asking. That being said, there’s a lot of freedom in this system, so I’m going to try to describe my process so that you can tweak the code should it not be giving you exactly what you want.
General resources
The solution, such as it is, depends on the Allen Institute’s “Restful Model Access (RMA)” API. Some generally useful resources when interacting with this API are.
- This post, which describes the RMA API more generally
- This post, which attempts to describe how RMA queries are put together
- This post, which attempts to catalog the kinds of filters and queries the RMA API can support in the most general sense
- And this post, which links to the data model backing the API
Specific solution
The python code below depends on the libraries requests
(which handles passing the query URLs to the API and getting results back) and pandas
(mostly for shaping the data and writing it out to a CSV). These can be installed with
pip install requests pandas
The solution performs two queries. First, it queries the model to find all of the NCBI genes that are a part of the “Developing Mouse” product. It then assembles a list of the internal IDs of these genes and uses it to query the StructureUnionize model as described in this post. This code ran for me in about 4 minutes and created as CSV file with ~ 49,000 rows.
import json
import pandas as pd
import requests
import time
def main():
# file that will be written out
dst_path = "structure_unionization_result.csv"
mouse_gene_id_list = get_list_of_mouse_genes()
print("====START GETTING ACTUAL EXPRESSION DATA====")
data = []
# chunk through mouse gene_id_list
gene_chunk_size = 500
for i0 in range(0, len(mouse_gene_id_list), gene_chunk_size):
gene_chunk = mouse_gene_id_list[i0:i0+gene_chunk_size]
data += get_expression_by_structure(
gene_id_list=gene_chunk
)
pd.DataFrame(data).to_csv(dst_path, index=False)
def get_list_of_mouse_genes():
"""
Query the API for all of the known NCBI mouse genes.
Return a sorted list of their 'id' values which can be
used for querying
The ID values being returned are IDs internal to the
Allen Institute API. When we use these IDs to query
for the actua gene expression, we will convert them
to Entrez IDs, acronyms, and names.
"""
t0 = time.time()
print("====QUERYING API FOR LIST OF ALL KNOWN MOUSE GENES====")
start_row = 0
n_total = 0
chunk_size = 10000
id_set = set()
while True:
query_url = (
"http://api.brain-map.org/api/v2/data/query.json?criteria="
"model::NcbiGene,"
"rma::criteria,organism[name$il'Mus musculus'],"
"rma::include,organism,products,"
"rma::options"
f"[num_rows$eq{chunk_size}]"
f"[start_row$eq{start_row}]"
)
raw_response = requests.get(query_url)
try:
response = json.loads(raw_response.text)
all_genes = response['msg']
except:
print("=======RAW RESULT=======")
print(raw_response.text)
print("=======")
raise
if response['num_rows'] == 0:
break
# only keep genes that are associated with
# 'Developing Mouse' products
dev_mouse_genes = []
for gene in all_genes:
keep_it = False
for prod in gene['products']:
if 'Developing Mouse' in prod['name']:
keep_it = True
break
if keep_it:
dev_mouse_genes.append(gene)
id_set = id_set.union(
set([int(gene['id']) for gene in dev_mouse_genes])
)
start_row += len(all_genes)
print(f' FOUND {len(id_set)} GENES SO FAR')
dur = time.time()-t0
print(f'FOUND {len(id_set)} GENES IN {dur:.2e} SECONDS')
return sorted(id_set)
def get_expression_by_structure(
gene_id_list):
"""
Query the API for gene expression in structure unionization
for all genes whose ids are in gene_id_list
Return a list of dicts characterizing the results.
"""
t0 = time.time()
print(f"QUERYING GENE IDs {min(gene_id_list)} through {max(gene_id_list)}")
chunk_size = 10000
start_row = 0
final_result = []
while True:
gene_filter = "genes[id$in" + ",".join([str(g) for g in gene_id_list]) + "]"
query_url = (
"http://api.brain-map.org/api/v2/data/"
"query.json?criteria=model::StructureUnionize,"
"rma::criteria,"
"section_data_set[delegate$eqfalse] ("
f"{gene_filter}"
"),"
"structure("
"structure_sets_structures("
"structure_set[name$eq'Developing Mouse - Coarse']"
")"
"),"
"rma::indclude,section_data_set(specimen(donor(age))),"
f"&start_row={start_row}&num_rows={chunk_size}"
"&order=ages.embryonic$desc,ages.days,structures.graph_order"
"&tabular=structure_unionizes.section_data_set_id,"
"genes.acronym as gene_acronym,"
"genes.name as gene_name,"
"genes.entrez_id as gene_entrez_id,"
"ages.name as age,"
"ages.embryonic,"
"ages.days as age_in_days,"
"donors.strain as donor_strain,"
"donors.sex as donor_sex,"
"structures.acronym as structure_acronym,"
"structures.name as structure_name,"
"structures.graph_order as structure_graph_order,"
"structure_unionizes.expression_energy"
)
raw_result = requests.get(query_url)
try:
result = json.loads(raw_result.text)
if start_row == 0:
print(f" EXPECT TO FIND {result['total_rows']} DATA ENTRIES")
except:
print("=======RAW RESULT=======")
print(raw_result.text)
print("=======")
raise
if not result['success']:
print(raw_result.text)
if result['num_rows'] == 0:
break
final_result += result['msg']
start_row += len(result['msg'])
print(f' got {len(final_result)}')
dur = time.time() -t0
print(f" FOUND {len(final_result)} DATA ENTRIES IN {dur:.2e} SECONDS")
return final_result
if __name__ == "__main__":
main()