import json
from datetime import datetime
from typing import Optional

import numpy as np
from dandi.dandiapi import DandiAPIClient
from isodate import parse_duration, Duration
from tqdm.notebook import tqdm
# Open a client for the DANDI archive and materialize the full listing of
# dandisets up front so it can be iterated (and sized by a progress bar) later.
client = DandiAPIClient()
dandisets = [*client.get_dandisets()]
More specific identification of NWB dandisets
The simpler tutorial only tested if the phrase "NWB" was in the name of any of the data standards for a dandiset.
The more official and precise method is to use the specific RRID of NWB, which is "RRID:SCR_015242".
# RRID (Research Resource Identifier) that identifies the NWB data standard.
NWB_RRID = "RRID:SCR_015242"

# Collect every dandiset whose asset summary lists NWB among its data standards.
nwb_dandisets = []
for dandiset in tqdm(dandisets):
    raw_metadata = dandiset.get_raw_metadata()
    # Any of these fields may be missing (or null) in the raw metadata, so fall
    # back to empty containers instead of raising a KeyError/TypeError.
    assets_summary = raw_metadata.get("assetsSummary") or {}
    data_standards = assets_summary.get("dataStandard") or []
    if any(
        data_standard.get("identifier") == NWB_RRID
        for data_standard in data_standards
    ):
        nwb_dandisets.append(dandiset)

print(f"There are currently {len(nwb_dandisets)} NWB datasets on DANDI!")
0%| | 0/223 [00:00<?, ?it/s]
There are currently 128 NWB datasets on DANDI!
Average age of subjects used in a dandiset
Let's consider a more advanced calculation - finding the average age of all the subjects used in a particular dandiset.
For this we will directly access the asset-level field `wasAttributedTo` as a key of the asset's raw metadata, instead of as an attribute.
We will also have to do some manual data manipulation to parse the ISO 8601 duration format used for the ages.
def iso_to_fractional_days(age_iso: str, experiment_date: str) -> Optional[float]:
    """
    Convert an ISO 8601 age into a float-valued number of days.

    A `datetime.timedelta` only exposes its `.days` (an integer, rounded down) or its `total_seconds()`,
    so the fractional number of days is derived from the total number of seconds.

    This helper also resolves some complications that can arise in other datasets when the age is
    measured in years, or when the age is a range.

    Parameters
    ----------
    age_iso : str
        The age as an ISO 8601 duration string, e.g. "P209DT55274S".
    experiment_date : str
        ISO 8601 timestamp of the experiment. It is only parsed when the age is a calendar-based
        `isodate.Duration`, where it serves as the end point used to resolve the duration into an
        exact `timedelta`.

    Returns
    -------
    float or None
        The age in days, or None when the age is a range (contains "/") and is therefore skipped.
    """
    if "/" in age_iso:  # Some ages have lower and upper bounds (a range) due to uncertainty
        return None  # Skip

    age_duration = parse_duration(datestring=age_iso)
    if isinstance(age_duration, Duration):
        # A calendar-based Duration has no fixed length on its own; anchor it to the
        # experiment date so it can be converted into an exact timedelta.
        experiment_datetime = datetime.fromisoformat(experiment_date)
        time_delta = age_duration.totimedelta(end=experiment_datetime)
    else:
        time_delta = age_duration

    # 60 seconds per minute * 60 minutes per hour * 24 hours per day (ignoring daylight savings time)
    seconds_per_day = 60 * 60 * 24
    return time_delta.total_seconds() / seconds_per_day
# Gather the age (in fractional days) of every subject attached to an asset of dandiset 000398.
all_subject_ages_in_days = []

dandiset = client.get_dandiset("000398")
assets = list(dandiset.get_assets())
for asset in tqdm(assets):
    raw_metadata = asset.get_raw_metadata()
    # An asset may carry no subject attribution at all; treat that as "no subjects"
    # rather than failing with a KeyError.
    subjects = raw_metadata.get("wasAttributedTo") or []
    for subject_metadata in subjects:
        if "age" not in subject_metadata:
            continue
        age_in_days = iso_to_fractional_days(
            age_iso=subject_metadata["age"]["value"],
            experiment_date=raw_metadata["wasGeneratedBy"][0]["startDate"],
        )
        # The helper returns None for ages it skips (ranges). Compare against None
        # explicitly so that a genuine age of 0.0 days is not silently dropped.
        if age_in_days is not None:
            all_subject_ages_in_days.append(age_in_days)

print(f"The average age of the subjects in dandiset #398 is: {np.mean(all_subject_ages_in_days)} days")
0%| | 0/42 [00:00<?, ?it/s]
The average age of the subjects in dandiset #398 is: 170.74276620370375 days
Count the number of spiking units across all sessions in an experiment
The number of units identified from spike sorting is not something that DANDI extracts automatically during upload...
But we can calculate it ourselves without downloading an entire dandiset!
We do this by streaming directly from the archive, which requires us to retrieve the asset path on the S3 backend of the DANDI archive and then set the `driver` argument to `ros3` (Read-Only S3).
There are several ways to retrieve the S3 path, but the easiest is to use the NWB Inspector helper function `nwbinspector.tools.get_s3_urls_and_dandi_paths`, which will format the path in the way `ros3` expects.
from warnings import simplefilter
simplefilter("ignore") # Suppress namespace warnings from reading older NWB files
from nwbinspector.tools import get_s3_urls_and_dandi_paths
from pynwb import NWBHDF5IO
s3_urls = get_s3_urls_and_dandi_paths(dandiset_id="000059")

# Map each streamed asset (keyed by its S3 URL) to the number of units it contains.
num_units_per_asset = dict()
for s3_url in tqdm(s3_urls):
    # Use the IO object as a context manager so each remote file handle is closed
    # before the next asset is opened, instead of leaking one handle per asset.
    with NWBHDF5IO(path=s3_url, load_namespaces=True, driver="ros3") as io:
        nwbfile = io.read()
        # The length must be read while the file is still open.
        if nwbfile.units:
            num_units_per_asset[s3_url] = len(nwbfile.units)
0%| | 0/54 [00:00<?, ?it/s]
# Display the mapping of S3 URL -> number of units for every asset that had units.
num_units_per_asset
{'https://dandiarchive.s3.amazonaws.com/blobs/a8e/fd7/a8efd760-6dd4-485e-9d7f-6df493ae29e5': 395, 'https://dandiarchive.s3.amazonaws.com/blobs/4d1/98e/4d198e63-c6c2-4697-86ae-98637b57b45e': 381, 'https://dandiarchive.s3.amazonaws.com/blobs/ad2/bcf/ad2bcf13-79c2-4f05-a002-e08ff6fe3654': 365, 'https://dandiarchive.s3.amazonaws.com/blobs/a7e/22f/a7e22f26-0d53-4264-b9ae-dfab2ffa4864': 361, 'https://dandiarchive.s3.amazonaws.com/blobs/b0a/94a/b0a94a39-0460-4e84-a053-abe8b9ab12ac': 344, 'https://dandiarchive.s3.amazonaws.com/blobs/7ae/56a/7ae56abd-66db-4e61-b75a-8e815580f3ed': 409, 'https://dandiarchive.s3.amazonaws.com/blobs/5ce/588/5ce58887-9750-45a9-bfc8-7a8445acb49c': 915, 'https://dandiarchive.s3.amazonaws.com/blobs/6d2/1df/6d21df2b-b391-42e6-89ba-5b9113a80c86': 1114, 'https://dandiarchive.s3.amazonaws.com/blobs/b17/bfc/b17bfcce-2b08-40e1-bd73-0a603c750066': 468, 'https://dandiarchive.s3.amazonaws.com/blobs/034/6b4/0346b4b8-5ade-441c-9088-8fd9f9392d74': 1014, 'https://dandiarchive.s3.amazonaws.com/blobs/86a/36b/86a36bf9-dd7a-4e25-a3a5-cf6f4f3b821d': 584, 'https://dandiarchive.s3.amazonaws.com/blobs/1a5/403/1a540330-e889-436a-93dd-fdf5befc2905': 928, 'https://dandiarchive.s3.amazonaws.com/blobs/df6/633/df663372-04f6-4e6d-94db-7f09218bca4d': 854, 'https://dandiarchive.s3.amazonaws.com/blobs/d42/9cd/d429cd65-2630-4ddb-85ff-e1547a47c9b0': 1024, 'https://dandiarchive.s3.amazonaws.com/blobs/04f/c85/04fc8532-6df0-42f3-8e93-0260f4bee758': 1730, 'https://dandiarchive.s3.amazonaws.com/blobs/261/48c/26148caa-f3e7-4991-ad0b-9b5033f8d9b2': 1827, 'https://dandiarchive.s3.amazonaws.com/blobs/e41/7bd/e417bd22-effb-4f2d-8571-e7426f0c9bcd': 1384, 'https://dandiarchive.s3.amazonaws.com/blobs/62b/204/62b20403-6377-42d7-909e-5c6346e59572': 855, 'https://dandiarchive.s3.amazonaws.com/blobs/343/097/3430974d-387d-414f-b341-a108e2b793cf': 476, 'https://dandiarchive.s3.amazonaws.com/blobs/627/99c/62799cfd-e2f9-4f86-8924-b0b5b1c32cb0': 1509, 
'https://dandiarchive.s3.amazonaws.com/blobs/eeb/bf6/eebbf65f-dc8e-4ebd-b4dc-b5d2b1ec88de': 1191, 'https://dandiarchive.s3.amazonaws.com/blobs/7ae/cb7/7aecb7dd-06d5-450e-b5ee-fd756b7c2371': 1183, 'https://dandiarchive.s3.amazonaws.com/blobs/f6d/638/f6d6384b-5a46-4a92-a30b-d5740c8aa36a': 1664, 'https://dandiarchive.s3.amazonaws.com/blobs/9c0/8a8/9c08a84a-7deb-4732-9338-92d7f1d30b52': 1344}
# Sum the per-asset counts to report the total number of units across all recorded assets.
print(f"Dandiset #59 has a total of {sum(num_units_per_asset.values())} identified spiking units!")
Dandiset #59 has a total of 22319 identified spiking units!
Going beyond
These examples show a few types of queries, but since the metadata structures are quite rich on both the dandiset and asset levels, they enable many complex queries beyond the examples here.
These metadata structures are also expanding over time as DANDI becomes more strict about what counts as essential metadata.
The `.get_raw_metadata` method of both `client.get_dandiset(...)` and the assets returned by `client.get_dandiset(...).get_assets()` provides a nice view into the available fields.
Note: for any attribute, it is recommended to first check that it is not `None` before checking for its value.
# Pretty-print the raw metadata of the first asset in `assets` to inspect the available fields.
print(json.dumps(assets[0].get_raw_metadata(), indent=4))
{ "id": "dandiasset:11c25674-6eff-43a8-8dba-7dea2e8c76c4", "path": "sub-San4/sub-San4_ses-20200302T142114_ecephys.nwb", "access": [ { "status": "dandi:OpenAccess", "schemaKey": "AccessRequirements" } ], "digest": { "dandi:sha2-256": "b770e3ac3f75f40618de2ba2a81e996429d5fc01dd530e9d826acc7a1ad0853c", "dandi:dandi-etag": "4c907ae8685aea1bfbe57316942b881f-4" }, "@context": "https://raw.githubusercontent.com/dandi/schema/master/releases/0.6.3/context.json", "approach": [ { "name": "electrophysiological approach", "schemaKey": "ApproachType" } ], "schemaKey": "Asset", "contentUrl": [ "https://api.dandiarchive.org/api/assets/11c25674-6eff-43a8-8dba-7dea2e8c76c4/download/", "https://dandiarchive.s3.amazonaws.com/blobs/429/baa/429baaad-a057-411d-8957-8460947aef73" ], "identifier": "11c25674-6eff-43a8-8dba-7dea2e8c76c4", "contentSize": 237267068, "publishedBy": { "id": "urn:uuid:f9fc60e7-3126-4912-8222-4de0c7d2cd7a", "name": "DANDI publish", "endDate": "2022-12-08T18:03:14.228709+00:00", "schemaKey": "PublishActivity", "startDate": "2022-12-08T18:03:14.228709+00:00", "wasAssociatedWith": [ { "id": "urn:uuid:2a080a04-80f8-4d25-835b-77aa87eb4b81", "name": "DANDI API", "version": "0.1.0", "schemaKey": "Software", "identifier": "RRID:SCR_017571" } ] }, "dateModified": "2022-12-02T20:15:00.997018-08:00", "datePublished": "2022-12-08T18:03:14.228709+00:00", "schemaVersion": "0.6.3", "encodingFormat": "application/x-nwb", "wasGeneratedBy": [ { "name": "Acquisition session", "schemaKey": "Session", "startDate": "2020-03-02T14:21:14-08:00", "description": "Fig 3i, S10" }, { "id": "urn:uuid:a698c09b-ca57-4cf3-a523-dfbb00a8f524", "name": "Metadata generation", "schemaKey": "Activity", "description": "Metadata generated by DANDI cli", "wasAssociatedWith": [ { "url": "https://github.com/dandi/dandi-cli", "name": "DANDI Command Line Interface", "version": "0.46.6", "schemaKey": "Software", "identifier": "RRID:SCR_019009" } ] } ], "wasAttributedTo": [ { "age": { "value": "P209DT55274S", 
"unitText": "ISO-8601 duration", "schemaKey": "PropertyValue", "valueReference": { "value": "dandi:BirthReference", "schemaKey": "PropertyValue" } }, "sex": { "name": "Male", "schemaKey": "SexType", "identifier": "http://purl.obolibrary.org/obo/PATO_0000384" }, "species": { "name": "Mus musculus - House mouse", "schemaKey": "SpeciesType", "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_10090" }, "genotype": "Emx1-Cre[tg/wt];Ai32[tg/wt]", "schemaKey": "Participant", "identifier": "San4" } ], "blobDateModified": "2022-12-02T17:21:54.708718-08:00", "variableMeasured": [ { "value": "ElectrodeGroup", "schemaKey": "PropertyValue" }, { "value": "ElectricalSeries", "schemaKey": "PropertyValue" } ], "measurementTechnique": [ { "name": "multi electrode extracellular electrophysiology recording technique", "schemaKey": "MeasurementTechniqueType" }, { "name": "surgical technique", "schemaKey": "MeasurementTechniqueType" } ] }