Source code for bids.layout.validation

"""Tools for validating BIDS projects."""

import hashlib
import json
import re
from collections import namedtuple
from os.path import join, abspath, dirname

import pandas as pd

__all__ = ['BIDSValidator']


class BIDSValidator():
    """Check file paths against the BIDS (Brain Imaging Data Structure) rules.

    The main method of this class is `is_bids()`. Use it to check whether a
    file path is compatible with BIDS. The rules are regular expressions
    loaded from JSON files in the ``config/validator`` directory that sits
    next to this module.

    Parameters
    ----------
    index_associated : bool, default: True
        Specifies if an associated data should be checked. If it is true
        then any file paths in directories `code/`, `derivatives/`,
        `sourcedata/` and `stimuli/` will pass the validation, else they
        won't.

    Examples
    --------
    >>> from bids.layout import BIDSValidator
    >>> validator = BIDSValidator()
    >>> filepaths = ["/sub-01/anat/sub-01_rec-CSD_T1w.nii.gz",
    ...              "/sub-01/anat/sub-01_acq-23_rec-CSD_T1w.exe",  # wrong extension
    ...              "/participants.tsv"]
    >>> for filepath in filepaths:
    ...     print(validator.is_bids(filepath))
    True
    False
    True
    """

    def __init__(self, index_associated=True):
        # Directory holding the JSON rule files, resolved relative to this module.
        self.rule_dir = join(dirname(abspath(__file__)), 'config', 'validator')
        self.index_associated = index_associated

    def is_bids(self, path):
        """Check if a file path is appropriate for BIDS.

        Main method of the validator. It delegates to the other ``is_*``
        methods and accepts the path as soon as one of them does.

        Parameters
        ----------
        path : str
            A path of a file you want to check.

        Returns
        -------
        bool
            True if at least one rule family accepts `path`.

        Examples
        --------
        >>> from bids.layout import BIDSValidator
        >>> validator = BIDSValidator()
        >>> validator.is_bids("/sub-01/ses-test/anat/sub-01_ses-test_rec-CSD_run-23_T1w.nii.gz")
        True
        >>> validator.is_bids("/sub-01/ses-test/sub-01_run-01_dwi.bvec")  # missed session in the filename
        False
        """
        checks = (
            self.is_top_level,
            self.is_associated_data,
            self.is_session_level,
            self.is_subject_level,
            self.is_phenotypic,
            self.is_file,
        )
        # Generator expression: stop reading rule files once a check passes.
        return any(check(path) for check in checks)

    def is_top_level(self, path):
        """Check if the file has appropriate name for a top-level file.

        A path qualifies if it matches a rule in ``top_level_rules.json`` or
        is listed verbatim in ``fixed_top_level_names.json``.
        """
        fixed_names_file = join(self.rule_dir, 'fixed_top_level_names.json')
        with open(fixed_names_file, 'r', encoding='utf-8') as f:
            fixed_top_level_names = json.load(f)['fixed_top_level_names']

        return (self._matches_any('top_level_rules.json', path)
                or path in fixed_top_level_names)

    def is_associated_data(self, path):
        """Check if file is appropriate associated data.

        Always returns False when the validator was created with
        ``index_associated=False``.
        """
        if not self.index_associated:
            return False
        return self._matches_any('associated_data_rules.json', path)

    def is_session_level(self, path):
        """Check if the file has appropriate name for a session level."""
        regexps = self.get_regular_expressions('session_level_rules.json')
        return any(self.conditional_match(regexp, path) for regexp in regexps)

    def is_subject_level(self, path):
        """Check if the file has appropriate name for a subject level."""
        return self._matches_any('subject_level_rules.json', path)

    def is_phenotypic(self, path):
        """Check if file is phenotypic data."""
        return self._matches_any('phenotypic_rules.json', path)

    def is_file(self, path):
        """Check if the path matches one of the file-level rules."""
        return self._matches_any('file_level_rules.json', path)

    def get_regular_expressions(self, filename):
        """Get regular expressions from a rule file.

        Parameters
        ----------
        filename : str
            Name of a JSON rule file inside ``self.rule_dir``.

        Returns
        -------
        list of str
            One regular expression per rule, in the order the rules appear
            in the file, with every token replaced by its alternatives.
        """
        return list(self._load_rules(filename).values())

    def get_path_values(self, path):
        """Return the subject and session labels found in a file path.

        The patterns are the ``sub`` and ``ses`` rules of ``path.json``.
        NOTE(review): each of those rules is assumed to contain exactly one
        capturing group holding the label -- confirm against path.json.

        Parameters
        ----------
        path : str
            The file path to inspect.

        Returns
        -------
        dict
            ``{'sub': <label or None>, 'ses': <label or None>}``.

        Raises
        ------
        KeyError
            If ``path.json`` does not define a ``sub`` or ``ses`` rule.
        """
        rules = self._load_rules('path.json')
        values = {}
        for key in ('sub', 'ses'):
            found = re.search(rules[key], path)
            # An empty capture counts as "not found", like a failed match.
            values[key] = (found.group(1) or None) if found else None
        return values

    def conditional_match(self, expression, path):
        """Match `path` against a rule whose third group must mirror its second.

        Adapted from JS code; JS does not support conditional groups, so the
        condition is checked here instead of inside the pattern. Only the
        first match of `expression` in `path` is considered.

        Parameters
        ----------
        expression : str
            Regular expression with at least three capturing groups.
        path : str
            The file path to test.

        Returns
        -------
        bool
            False if `expression` is not found. Otherwise True when group 2
            is empty, or when group 2 equals group 3 without its first
            character.

        Raises
        ------
        IndexError
            If `expression` matches but has fewer than three groups.
        """
        found = re.search(expression, path)
        if found is None:
            return False
        # Groups that did not participate in the match are treated as empty.
        # Presumably group 2 is the session directory and group 3 the
        # "_ses-..." part of the file name -- verify against the rule file.
        second = found.group(2) or ''
        third = found.group(3) or ''
        return (not second) or second == third[1:]

    def _load_rules(self, filename):
        """Read a rule file and return ``{rule name: expanded regexp}``."""
        with open(join(self.rule_dir, filename), 'r', encoding='utf-8') as f:
            rules = json.load(f)

        expanded = {}
        for name, rule in rules.items():
            regexp = rule["regexp"]
            # Each token is a placeholder replaced by an alternation of its values.
            for token, alternatives in rule.get("tokens", {}).items():
                regexp = regexp.replace(token, "|".join(alternatives))
            expanded[name] = regexp
        return expanded

    def _matches_any(self, rule_file, path):
        """Return True if any regexp from `rule_file` is found in `path`."""
        return any(re.search(regexp, path) is not None
                   for regexp in self.get_regular_expressions(rule_file))
# Result container returned by validate_sequences().
_SequenceValidation = namedtuple('output',
                                 ['duplicates', 'summary', 'problems'])


def validate_sequences(layout, config):
    """Checks files in BIDS project match user defined expectations.

    This method is a wrapper for the check_duplicate_files() and
    check_expected_files() methods. Use it to check whether there are files
    with duplicate content within the BIDS data set and to check the number
    of data set files against a user customized configuration file.
    Returns a named tuple of three data frames: duplicates, summary, and
    problems.

    Parameters
    ----------
    layout: BIDSLayout class
        A BIDSLayout path of a data set.
    config: string
        Path to customized configuration file. Requires `runs` as an input.
        See the sample config for an example
        (bids/layout/tests/data/sample_validation_config.json).

    Returns
    -------
    namedtuple
        Fields ``duplicates``, ``summary`` and ``problems``, each a
        pandas DataFrame.

    Examples
    --------
    >>> layout = bids.grabbids.BIDSLayout('/path_to/sample_project_root')
    >>> dfs = validate_sequences(layout, 'pybids/bids/layout/tests/data/sample_validation_config.json')
    >>> dfs.duplicates
    # Put example output here
    >>> dfs.summary
    # Put example output here
    >>> dfs.problems
    # Put example output here
    """
    duplicate_file_df = check_duplicate_files(layout)
    summary_df, problem_df = check_expected_files(layout, config)
    return _SequenceValidation(duplicate_file_df, summary_df, problem_df)


def _md5_of_file(fname, chunk_size=4096):
    """Return the hexadecimal MD5 digest of a file, read in chunks."""
    digest = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def check_duplicate_files(layout):
    """Checks images in BIDS project are not duplicated.

    Check whether any files have duplicate content within the BIDS data set.
    Only the ``.nii.gz`` files returned by ``layout.get()`` are examined.
    Returns a data frame: duplicate_file_df.

    Parameters
    ----------
    layout: BIDSLayout class
        A BIDSLayout path of a data set.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``hash`` and ``filename``.

    Examples
    --------
    >>> layout = bids.grabbids.BIDSLayout('/path_to/sample_project_root')
    >>> duplicate_file_df = check_duplicate_files(layout)
    >>> duplicate_file_df
    # Put example output here

    Notes
    ------
    Returns a data frame in which the first column is the file identifier
    (the MD5 digest of the content) and the second column is the path to the
    file. Files with matching identifiers have the same content. Rows with
    the same identifier are adjacent.
    """
    # Group file names by content digest, keeping first-seen order.
    hash_map = {}
    for nifti_file in layout.get(return_type="file", extensions='.nii.gz'):
        hash_map.setdefault(_md5_of_file(nifti_file), []).append(nifti_file)

    # Build the long-format table directly; this also yields the expected
    # columns when the data set has no matching files.
    rows = [(md5sum, fname)
            for md5sum, fnames in hash_map.items()
            for fname in fnames]
    return pd.DataFrame(rows, columns=['hash', 'filename'])


def check_expected_files(layout, config):
    """Checks files in BIDS project match user defined expectations.

    This method checks the number of data set files against a user
    customized configuration file. Returns two data frames: summary_df,
    problem_df.

    Parameters
    ----------
    layout: BIDSLayout class
        A BIDSLayout path of a data set.
    config: string
        Path to customized configuration file.

    Returns
    -------
    summary_df : pandas.DataFrame
        One row per (subject, configured sequence) holding the sequence
        parameters plus ``subject``, ``runs_found`` and ``problem``.
    problem_df : pandas.DataFrame
        The rows of `summary_df` where ``runs_found`` differs from ``runs``.

    Raises
    ------
    KeyError
        If the config has no ``sequences`` entry or a sequence lacks ``runs``.

    Examples
    --------
    >>> layout = bids.grabbids.BIDSLayout('/path_to/sample_project_root')
    >>> summary_df, problem_df = check_expected_files(layout, 'pybids/bids/layout/tests/data/sample_validation_config.json')
    >>> summary_df
    # Put example output here
    >>> problem_df
    # Put example output here

    Notes
    --------
    `runs` is a mandatory field in the config file.
    The configuration file can take any keys that are valid arguments for
    pybids `layout.get()`. Values should match those in the BIDS file names.
    See the sample config for an example
    (bids/layout/tests/data/sample_validation_config.json).
    The more specific keys are provided, the more informative the output
    will be.
    """
    with open(config, encoding='utf-8') as f:
        json_data = json.load(f)

    rows = []
    for sub in layout.get_subjects():
        for scan_params_d in json_data['sequences']:
            # Copy so the loaded config is not modified between subjects.
            scan_params = scan_params_d.copy()
            # `runs` is the expected count, not a query argument.
            seq_params = {key: value for key, value in scan_params.items()
                          if key != 'runs'}
            actual_runs = layout.get(return_type='obj', subject=sub,
                                     extensions='.nii.gz', **seq_params)
            scan_params['subject'] = sub
            scan_params['runs_found'] = len(actual_runs)
            scan_params['problem'] = len(actual_runs) != scan_params['runs']
            rows.append(scan_params)

    summary_df = pd.DataFrame(rows)
    if summary_df.empty:
        # No subjects or no sequences: there is no 'problem' column to filter.
        return summary_df, summary_df.copy()
    problem_df = summary_df.loc[summary_df['problem']]
    return summary_df, problem_df