UNPKG

ideogram

Version:

Chromosome visualization for the web

241 lines (179 loc) 7.12 kB
'''Write TSV file of high-quality genome assemblies by organism

Output file has one row per assembly, with columns for:
* Organism scientific name (e.g. Homo sapiens, Macaca fascicularis)
* Organism common name (e.g. human, crab-eating macaque)
* Assembly name (e.g. GRCh38, Macaca_fascicularis_5.0)
* Assembly accession (e.g. GCA_000001405.28, GCA_000364345.1)

EXAMPLES

python3 list_assemblies.py
'''

import argparse
import json
import math
import os
import urllib.request as request

asms_path = 'assembly_summary.txt'
asms_path_historical = 'assembly_summary_historical.txt'

eutils_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
esummary_base = eutils_base + '/esummary.fcgi'


def fetch_asm_summary(group, group_asms_path, group_asms_historical_path):
    '''Retrieve NCBI assembly summary TSV files by organism group

    Downloads the current and the historical summary for `group`, writing
    them to `group_asms_path` and `group_asms_historical_path` respectively.
    A file that already exists locally is treated as a cache and is not
    downloaded again.
    '''
    downloads = [
        (group_asms_path, asms_path),
        (group_asms_historical_path, asms_path_historical),
    ]

    for path, remote_name in downloads:
        if os.path.exists(path):
            continue

        # Example URL:
        # https://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/assembly_summary.txt
        leaf = f'{group}/{remote_name}'
        url = f'https://ftp.ncbi.nlm.nih.gov/genomes/genbank/{leaf}'
        with request.urlopen(url) as response:
            data = response.read().decode('utf-8')

        # Write with the same encoding the payload was decoded with, so the
        # cache does not depend on the platform's default encoding.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(data)


def get_taxid_chunks(taxids):
    '''Return taxids in comma-delimited lists of 500

    Needed because NCBI EUtils limits parameters to 500 values each.

    `taxids` is a list of strings.  Returns a list of strings, each holding
    at most 500 comma-joined taxids; every input taxid appears exactly once
    and in its original order.  An empty input yields an empty list.
    '''
    eutils_limit = 500
    num_requests = math.ceil(len(taxids) / eutils_limit)

    taxid_chunks = []
    for num in range(num_requests):
        pos = num * eutils_limit
        # Slicing clamps at the end of the list, so the final (possibly
        # shorter) chunk needs no special-casing.
        taxids_segment = taxids[pos:pos + eutils_limit]
        taxid_chunks.append(','.join(taxids_segment))

    return taxid_chunks


def add_common_names(asms):
    '''Add organism common names (e.g. human) to genome assembly objects

    Each assembly dict in `asms` gains an 'organism_common_name' key, looked
    up from NCBI Taxonomy via EUtils ESummary.  The dicts are updated in
    place and also returned in a new list.  Organisms for which no common
    name is returned get an empty string.
    '''
    # NCBI Taxonomy identifiers, an organism ID.  Human: 9606, etc.
    # Deduplicated; sorted so the requests are issued in a stable order.
    taxids = sorted({asm['taxid'] for asm in asms})

    names_by_taxid = {}

    for taxid_chunk in get_taxid_chunks(taxids):
        eutils_url = (
            f'{esummary_base}?db=taxonomy&format=json&id={taxid_chunk}'
        )
        with request.urlopen(eutils_url) as response:
            data = json.loads(response.read().decode('utf-8'))

        for taxid, record in data['result'].items():
            if taxid == 'uids':
                # Placeholder list of IDs, not a taxonomy record
                continue
            names_by_taxid[taxid] = record.get('commonname', '')

    new_asms = []
    for asm in asms:
        # A taxid absent from the response should not abort the whole run
        # after every download has completed; leave its name blank instead.
        asm['organism_common_name'] = names_by_taxid.get(asm['taxid'], '')
        new_asms.append(asm)

    return new_asms


def _parse_assemblies(group, path, keep):
    '''Parse an assembly summary file, returning rows accepted by `keep`

    `keep` is called with each row as a dict keyed by column header.  Kept
    rows are tagged with 'organism_group' set to `group`.
    '''
    with open(path, encoding='utf-8') as f:
        asms_file = f.readlines()

    # For reference, expected headers are:
    # assembly_accession, bioproject, biosample, wgs_master, refseq_category,
    # taxid, species_taxid, organism_name, infraspecific_name, isolate,
    # version_status, assembly_level, release_type, genome_rep, seq_rel_date,
    # asm_name, submitter, gbrs_paired_asm, paired_asm_comp, ftp_path,
    # excluded_from_refseq, relation_to_type_material
    #
    # The header is the second line; accept its leading '#' with or without
    # a following space.
    headers = asms_file[1].lstrip('#').strip().split('\t')

    asms = []
    for line in asms_file[2:]:
        columns = line.strip().split('\t')
        asm = dict(zip(headers, columns))
        if not keep(asm):
            continue
        asm['organism_group'] = group
        asms.append(asm)

    return asms


def parse_current_assemblies(group, group_asms_path):
    '''Parse current genome assemblies

    Keeps only chromosome-level, major-release assemblies whose RefSeq
    category is 'representative genome' or 'reference genome'.
    '''
    categories = ['representative genome', 'reference genome']

    def keep(asm):
        # .get() so blank or truncated rows are skipped rather than raising
        return (
            asm.get('assembly_level') == 'Chromosome'
            and asm.get('release_type') == 'Major'
            and asm.get('refseq_category') in categories
        )

    return _parse_assemblies(group, group_asms_path, keep)


def parse_historical_assemblies(group, group_asms_historical_path):
    '''Parse historical genome assemblies

    Keeps only chromosome-level, major-release assemblies submitted by the
    Genome Reference Consortium.
    '''
    def keep(asm):
        # .get() so blank or truncated rows are skipped rather than raising
        return (
            asm.get('assembly_level') == 'Chromosome'
            and asm.get('release_type') == 'Major'
            and asm.get('submitter') == 'Genome Reference Consortium'
        )

    return _parse_assemblies(group, group_asms_historical_path, keep)


def get_assemblies():
    '''Fetch metadata on genome assemblies from NCBI, and write it locally

    Background:
    Genome assemblies are sequenced chromosomes for an organism.  Here,
    we get metadata on each assembly, like its name (e.g. GRCh38), accession
    (an identifier like GCA_000001405.15), the organism's scientific and
    common names (Homo sapiens, human) and more.

    Returns a list of assembly dicts, grouped by organism group and sorted
    by organism scientific name within each group.
    '''
    groups = [
        'fungi', 'invertebrate', 'plant', 'protozoa',
        'vertebrate_mammalian', 'vertebrate_other'
    ]

    asms = []

    for group in groups:
        group_asms_path = f'{group}_{asms_path}'
        group_asms_historical_path = f'{group}_{asms_path_historical}'

        fetch_asm_summary(group, group_asms_path, group_asms_historical_path)

        group_asms = parse_current_assemblies(group, group_asms_path)
        group_asms += parse_historical_assemblies(
            group, group_asms_historical_path
        )

        asms += sorted(group_asms, key=lambda asm: asm['organism_name'])

    return add_common_names(asms)


def main(argv=None):
    '''Parse command-line arguments, then fetch and write the assembly list

    `argv` defaults to the process's command-line arguments.
    '''
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--output-dir',
                        help='Directory to send output data to',
                        default='')
    args = parser.parse_args(argv)
    output_dir = args.output_dir

    asms = get_assemblies()

    output_headers = [
        'organism_name', 'organism_common_name',
        'asm_name', 'assembly_accession'
    ]

    asm_list = ['# ' + '\t'.join(output_headers)]
    for asm in asms:
        asm_list.append('\t'.join(asm[header] for header in output_headers))
    content = '\n'.join(asm_list)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join supplies the separator; an empty output_dir resolves to
    # the current working directory.
    output_file = os.path.join(output_dir, 'assemblies.tsv')

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(content)

    # Count assemblies only, not the header row
    print(f'Wrote list of {len(asms)} assemblies to {output_file}')


if __name__ == '__main__':
    main()