Hello @Zsofi6
There are two ways (that I can think of) one might go about doing this. The first is a more “modern” way (by which I mean made available in the last two years) that involves downloading some of our spatial transcriptomics data and doing the statistical analysis yourself on the cell-by-gene matrix. The second is something of a “legacy” product provided by the Institute and involves using our APIs to download statistical summaries of older datasets directly. I will discuss both below.
Using the spatial transcriptomics data
This approach uses the Python API provided by our abc_atlas_access library, which allows users to download the data behind our ABC Atlas product. There are a lot of good Jupyter notebook tutorials around abc_atlas_access. I will draw your attention to the following (see also the short sketch after this list):
- the notebooks linked on this page show how to access the cell-by-gene data from our Whole Mouse Brain spatial transcriptomics dataset.
- the notebooks linked on this page show how to access the linkage between cells in the cell-by-gene data and CCF (“Common Coordinate Framework”) parcellations.
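If you want to get a feel for what is available before committing to the full download, the cache object can report its contents. Here is a minimal sketch, assuming I am remembering the helper names correctly (the getting-started notebook linked above is the authority here), with a placeholder cache directory:

```python
import pathlib

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache

# any directory with enough free disk space will do
cache_dir = pathlib.Path("/path/to/your/abc_cache")
cache_dir.mkdir(parents=True, exist_ok=True)

cache = AbcProjectCache.from_cache_dir(cache_dir)

# list the dataset directories in the current data release and the
# metadata files available for the MERFISH-to-CCF mapping
print(cache.list_directories)
print(cache.list_metadata_files("MERFISH-C57BL6J-638850-CCF"))
```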
To bring it all together, here is some Python code that uses abc_atlas_access to download the relevant data and create a CSV file listing every cell in the dataset, its expression values in four arbitrarily chosen genes, and the parcellation terms to which it has been registered:
```python
import abc_atlas_access.abc_atlas_cache.abc_project_cache as cache_module
import anndata
import numpy as np
import pandas as pd
import pathlib
import time


def main():
    global_t0 = time.time()

    dst_path = "cell_by_gene_by_parcellation.csv"

    # Change this to point to the directory into which you wish to
    # download the data.
    cache_dir = pathlib.Path(
        "/Users/scott.daniel/KnowledgeEngineering/abc_cache"
    )
    assert cache_dir.is_dir()

    cache = cache_module.AbcProjectCache.from_cache_dir(cache_dir)

    # load a dataframe associating each cell to CCF coordinates
    # and CCF parcellations
    cell_metadata = cache.get_metadata_dataframe(
        directory="MERFISH-C57BL6J-638850-CCF",
        file_name="ccf_coordinates"
    )

    # load a dataframe mapping CCF parcellations to structures,
    # substructures, divisions, etc.
    ccf_parcellations = cache.get_metadata_dataframe(
        directory="Allen-CCF-2020",
        file_name="parcellation_to_parcellation_term_membership"
    )

    # the path to the h5ad file listing the gene expression
    # profile of each cell
    data_path = cache.get_data_path(
        directory="MERFISH-C57BL6J-638850",
        file_name="C57BL6J-638850/raw"
    )

    print("===WE HAVE DOWNLOADED ALL NECESSARY DATA===")
    t0 = time.time()

    # create a dataframe containing the gene expression profiles
    # of each cell in a subset of genes
    desired_genes = ['Prrxl1', 'Scn7a', 'Nrn1', 'Serpine2']
    src = anndata.read_h5ad(data_path, 'r')
    gene_idx = np.where(np.isin(src.var.gene_symbol, desired_genes))[0]

    # make sure gene symbols are in the same order as gene_idx
    gene_symbols = src.var.gene_symbol.values[gene_idx]

    expression_df = pd.DataFrame(
        data=src.X[:, gene_idx].toarray(),
        columns=gene_symbols,
        index=src.obs.index.values)

    # join gene expression to the CCF metadata for each cell
    cell_metadata = cell_metadata.join(
        expression_df,
        on='cell_label'
    )

    # loop over the different classes of CCF parcellation ('structure',
    # 'substructure', 'division') and add the label and name of the
    # CCF parcellation assigned to each cell
    for order in set(ccf_parcellations.parcellation_term_set_name):
        ccf_subset = ccf_parcellations[
            ccf_parcellations.parcellation_term_set_name == order
        ].set_index("parcellation_index")

        ccf_subset = ccf_subset.rename(
            {"parcellation_term_label": f"{order}_label",
             "parcellation_term_name": f"{order}_name"},
            axis=1
        )[[f"{order}_label", f"{order}_name"]]

        cell_metadata = cell_metadata.join(
            ccf_subset,
            on="parcellation_index"
        )

    # Some cells failed quality control and thus do not appear
    # in the cell-by-gene expression matrix. These cells will have
    # NULL gene expression values. Just drop them from the dataframe.
    cell_metadata.dropna(axis=0, subset=gene_symbols, inplace=True)

    cell_metadata.to_csv(dst_path, index=False)
    print("SUCCESS")
    print(f"wrote {dst_path}")
    dur = time.time() - t0
    print(f'processing took {dur:.2e} seconds')
    global_dur = time.time() - global_t0
    print(f'with data download took {global_dur:.2e} seconds')


if __name__ == "__main__":
    main()
```
Note: the first time you run this script it will take about 10 minutes and download 7.5 GB of data. You will be limited to the 550 genes measured in our spatial transcriptomics dataset. As alluded to in the notebooks linked above, there is also an imputed dataset that estimates gene expression for the spatial transcriptomics cells over a larger 8,000-gene panel. Pointing this script at that data will require downloading ~46 GB of data.
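For what it is worth, pointing the script at the imputed data should only require changing the arguments to get_data_path. The directory and file names below are my best recollection, so treat them as assumptions and verify them with the cache's listing helpers first:

```python
import pathlib

import abc_atlas_access.abc_atlas_cache.abc_project_cache as cache_module

cache = cache_module.AbcProjectCache.from_cache_dir(
    pathlib.Path("/path/to/your/abc_cache")
)

# ASSUMED names -- verify with cache.list_directories and
# cache.list_data_files("MERFISH-C57BL6J-638850-imputed") before relying on them
data_path = cache.get_data_path(
    directory="MERFISH-C57BL6J-638850-imputed",
    file_name="C57BL6J-638850-imputed/log2"
)
print(data_path)
```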
Using the legacy API
Older datasets published by the Institute are available through a RESTful API. Here is some code to download the summary expression data for a gene across all anatomical structures in a specified structure set. Note that the data is summarized per donor organism (see the aggregation sketch after the script).
```python
import json
import pandas as pd
import requests
import time


def main():
    # file that will be written out
    dst_path = "cell_by_region.csv"

    data = get_expression_by_structure(
        gene_symbol="Serpine2",
        structure_set="Mouse - Areas"
    )

    pd.DataFrame(data).to_csv(dst_path, index=False)
    print(f'WROTE RESULTS TO {dst_path}')


def get_expression_by_structure(
        gene_symbol,
        structure_set="Mouse Cell Types - Structures"):
    """
    Query the API for gene expression in structure unionization
    for genes whose acronym is specified by gene_symbol in the
    set of structures specified by structure_set.

    Return a list of dicts characterizing the results.
    """
    t0 = time.time()
    chunk_size = 10000
    start_row = 0
    final_result = []
    while True:
        gene_filter = f"genes[acronym$il'{gene_symbol}']"
        query_url = (
            "http://api.brain-map.org/api/v2/data/"
            "query.json?criteria=model::StructureUnionize,"
            "rma::criteria,"
            "section_data_set[delegate$eqfalse]("
            f"{gene_filter}"
            "),"
            "structure("
            "structure_sets_structures("
            f"structure_set[name$eq'{structure_set}']"
            ")"
            "),"
            "rma::include,section_data_set(specimen(donor(age))),"
            f"&start_row={start_row}&num_rows={chunk_size}"
            "&order=ages.embryonic$desc,ages.days,structures.graph_order"
            "&tabular=structure_unionizes.section_data_set_id,"
            "genes.acronym as gene_acronym,"
            "genes.name as gene_name,"
            "genes.entrez_id as gene_entrez_id,"
            "ages.name as age,"
            "ages.embryonic,"
            "ages.days as age_in_days,"
            "donors.strain as donor_strain,"
            "donors.sex as donor_sex,"
            "structures.acronym as structure_acronym,"
            "structures.name as structure_name,"
            "structures.graph_order as structure_graph_order,"
            "structure_unionizes.*"
        )

        raw_result = requests.get(query_url)
        try:
            result = json.loads(raw_result.text)
            if start_row == 0:
                print(f" EXPECT TO FIND {result['total_rows']} DATA ENTRIES")
        except json.JSONDecodeError:
            print("=======RAW RESULT=======")
            print(raw_result.text)
            print("=======")
            raise

        if not result['success']:
            print(raw_result.text)

        if result['num_rows'] == 0:
            break

        final_result += result['msg']
        start_row += len(result['msg'])
        print(f' got {len(final_result)}')

    dur = time.time() - t0
    print(f" FOUND {len(final_result)} DATA ENTRIES IN {dur:.2e} SECONDS")
    return final_result


if __name__ == "__main__":
    main()
```
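Since the unionize records come back one per section data set (i.e. roughly one per donor), you will probably want to summarize across them. Here is a minimal pandas sketch, assuming the CSV written by the script above; expression_energy is one of the StructureUnionize fields pulled in by structure_unionizes.*:

```python
import pandas as pd

df = pd.read_csv("cell_by_region.csv")

# average the unionized expression energy across section data sets
# (donors) within each structure
summary = (
    df.groupby(["structure_acronym", "structure_name"])["expression_energy"]
    .agg(["mean", "std", "count"])
    .sort_values("mean", ascending=False)
)
print(summary.head(10))
```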
Going back to the query script: you will need to specify a valid value for structure_set. Here is some code to list all of the valid structure sets in this API:
```python
import json
import requests

structure_query = (
    "http://api.brain-map.org/api/v2/data/StructureSet/"
    "query.json?start_row=0&num_rows=2000"
    "&order=structure_sets.name"
)
print(structure_query)

# num_rows is set well above the API's default page size so that
# all structure sets come back in one request
result = json.loads(requests.get(structure_query).text)
for structure_set in result['msg']:
    print(f"'{structure_set['name']}' :: '{structure_set['description']}'")
```
The “interesting” data elements (those pertaining to gene expression) come from our StructureUnionizes data model, which is documented here.
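If you just want to see which fields a StructureUnionize record carries without digging through the documentation, you can pull a single record and print its keys. A quick sketch:

```python
import json
import requests

url = (
    "http://api.brain-map.org/api/v2/data/"
    "query.json?criteria=model::StructureUnionize&num_rows=1"
)

# fetch one record and list the field names it exposes
record = json.loads(requests.get(url).text)['msg'][0]
print(sorted(record.keys()))
```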
More generally, should you need to modify the API queries, some helpful documentation can be found below:
I know this was a lot. Please reach out if anything is confusing or not helpful.