UNPKG

ideogram

Version:

Chromosome visualization for the web

151 lines (128 loc) 4.82 kB
"""Converts cytogenetic band data from TSV to JSONP Example: cd python/scripts python3 convert_band_data.py NCBI provides cytogenetic band data as a TSV file. This script parses those TSVs into JSONP, and assigns that to a global variable in any HTML document that includes the output file. NCBI: #chromosome arm band iscn_start iscn_stop bp_start bp_stop stain density ftp://ftp.ncbi.nlm.nih.gov/pub/gdp/ideogram_9606_GCF_000001305.14_550_V1 """ import json from os import walk # Upstream NCBI ideogram files had the following naming pattern: # * ideogram_$taxonomyID_$primaryAssemblyUnitAccession_$bandResolution_$version # # Transformed data files have been renamed to use the following pattern: # * $scientificName-$assemblyAccession-$bandResolution.json # # The renaming improves human intelligibility, and allows Ideogram.js to # bypass API calls to look up Taxonomy IDs and primary assembly unit # acc.ver. The latter significantly improves performance for simple use cases. # # Some mappings contain two output filenames. This enables Ideogram.js to # look up these local data files when assembly accession is specified in the # Ideogram constructor output_mappings = { 'ideogram_9606_GCF_000001305.14_850_V1': [ 'homo-sapiens', 'homo-sapiens-GCF_000001405.26' ], 'ideogram_9606_GCF_000001305.14_550_V1': [ 'homo-sapiens-550', 'homo-sapiens-GCF_000001405.26-550' ], 'ideogram_9606_GCF_000001305.14_400_V1': [ 'homo-sapiens-400', 'homo-sapiens-GCF_000001405.26-400' ], 'ideogram_9606_GCF_000001305.13_850_V1': [ 'homo-sapiens-GCF_000001405.13', 'homo-sapiens-GCF_000001405.13-850' ], 'ideogram_9606_GCF_000001305.13_550_V1': [ 'homo-sapiens-GCF_000001405.13-550' ], 'ideogram_9606_GCF_000001305.12_1200_V1': [ 'homo-sapiens-GCF_000001405.12-1200' ], 'ideogram_9606_GCF_000001305.12_850_V1': [ 'homo-sapiens-GCF_000001405.12', 'homo-sapiens-GCF_000001405.12-850' ], 'ideogram_9606_GCF_000001305.12_550_V1': [ 'homo-sapiens-GCF_000001405.12-550' ], 'ideogram_9606_GCF_000001305.12_400_V1': [ 'homo-sapiens-GCF_000001405.12-400' ], 'ideogram_10090_GCF_000000055.19_NA_V2': [ 'mus-musculus', 'mus-musculus-GCF_000001635.20' ], 'ideogram_10116_GCF_000000225.4_NA_V1': [ 'rattus-norvegicus', 'rattus-norvegicus-GCF_000001895.5' ] } def main(): in_dir = '../../data/bands/ncbi/' out_dir = '../../data/bands/native/' f = [] for (dirpath, dirnames, filenames) in walk(in_dir): print(filenames) f.extend(filenames) break for input_file in f: if input_file[-3:] != 'tsv': # Skip e.g. README.md continue output = [] fn = input_file.split('.tsv')[0] if fn in output_mappings: fn = output_mappings[fn] else: # e.g. "banana.tsv" -- for pre-accessioned assemblies fn = [fn] output_file = out_dir + fn[0] + '.json' rows = open(in_dir + input_file, 'r').readlines() if len(rows[0].split("\t")) == 4: # e.g. ../data/bands/ncbi/ideogram_banana_v0.1.tsv #chromosome arm bp_start bp_stop max_chr_length = 0 for row in rows[1:]: columns = row.split('\t') bp_stop = int(columns[3]) if bp_stop > max_chr_length: max_chr_length = bp_stop for row in rows[1:]: columns = row.split('\t') chr = columns[0] arm = columns[1] band = '1' bp_start = int(columns[2]) bp_stop = int(columns[3]) iscn_start = '0' iscn_stop = str(round(bp_stop - bp_start) / max_chr_length * 10000) columns = [ chr, arm, band, iscn_start, iscn_stop, str(bp_start), str(bp_stop) ] output.append(" ".join(columns)) else: # e.g. ../data/bands/ncbi/ideogram_9606_GCF_000001305.13_550_V1 # #chromosome arm band iscn_start iscn_stop bp_start bp_stop stain density for row in rows[1:]: columns = row.replace('\n', '').replace('\t', ' ') output.append(columns) output = {'chrBands': output} output = json.dumps(output) open(output_file, 'w').write(output) # Write assembly-specific name, if default omits assembly if len(fn) > 1: output_file_2 = out_dir + fn[1] + '.json' open(output_file_2, 'w').write(output) if __name__ == '__main__': main()