ideogram
Version:
Chromosome visualization for the web
151 lines (128 loc) • 4.82 kB
Plain Text
"""Converts cytogenetic band data from TSV to JSONP
Example:
cd python/scripts
python3 convert_band_data.py
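NCBI provides cytogenetic band data as a TSV file.
This script parses those TSVs and writes each one out as a JSON file
holding an object with a single `chrBands` array, one string per band.
Note: the output is plain JSON, not JSONP; no global variable assignment
is written to the output file.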
NCBI: #chromosome arm band iscn_start iscn_stop bp_start bp_stop stain density
ftp://ftp.ncbi.nlm.nih.gov/pub/gdp/ideogram_9606_GCF_000001305.14_550_V1
"""
import json
from os import walk
# Upstream NCBI ideogram files had the following naming pattern:
# * ideogram_$taxonomyID_$primaryAssemblyUnitAccession_$bandResolution_$version
#
# Transformed data files have been renamed to use the following pattern:
# * $scientificName-$assemblyAccession-$bandResolution.json
#
# The renaming improves human intelligibility, and allows Ideogram.js to
# bypass API calls to look up Taxonomy IDs and the accession.version of the
# primary assembly unit. The latter significantly improves performance for
# simple use cases.
#
# Some mappings contain two output filenames. This enables Ideogram.js to
# look up these local data files when assembly accession is specified in the
# Ideogram constructor
# Maps an upstream NCBI file name (without the ".tsv" extension) to the list
# of output base names (without ".json") that the converted data is written
# under. The first name is the default; a second name, when present, is the
# assembly-specific alias.
output_mappings = {
    # Taxonomy ID 9606 -> homo-sapiens
    "ideogram_9606_GCF_000001305.14_850_V1": [
        "homo-sapiens",
        "homo-sapiens-GCF_000001405.26",
    ],
    "ideogram_9606_GCF_000001305.14_550_V1": [
        "homo-sapiens-550",
        "homo-sapiens-GCF_000001405.26-550",
    ],
    "ideogram_9606_GCF_000001305.14_400_V1": [
        "homo-sapiens-400",
        "homo-sapiens-GCF_000001405.26-400",
    ],
    "ideogram_9606_GCF_000001305.13_850_V1": [
        "homo-sapiens-GCF_000001405.13",
        "homo-sapiens-GCF_000001405.13-850",
    ],
    "ideogram_9606_GCF_000001305.13_550_V1": [
        "homo-sapiens-GCF_000001405.13-550",
    ],
    "ideogram_9606_GCF_000001305.12_1200_V1": [
        "homo-sapiens-GCF_000001405.12-1200",
    ],
    "ideogram_9606_GCF_000001305.12_850_V1": [
        "homo-sapiens-GCF_000001405.12",
        "homo-sapiens-GCF_000001405.12-850",
    ],
    "ideogram_9606_GCF_000001305.12_550_V1": [
        "homo-sapiens-GCF_000001405.12-550",
    ],
    "ideogram_9606_GCF_000001305.12_400_V1": [
        "homo-sapiens-GCF_000001405.12-400",
    ],
    # Taxonomy ID 10090 -> mus-musculus
    "ideogram_10090_GCF_000000055.19_NA_V2": [
        "mus-musculus",
        "mus-musculus-GCF_000001635.20",
    ],
    # Taxonomy ID 10116 -> rattus-norvegicus
    "ideogram_10116_GCF_000000225.4_NA_V1": [
        "rattus-norvegicus",
        "rattus-norvegicus-GCF_000001895.5",
    ],
}
def _arm_rows_to_bands(rows):
    """Build band strings from 4-column arm rows.

    Each row is expected to hold, tab-separated: chromosome, arm, bp_start,
    bp_stop. Every arm becomes one band named '1' whose ISCN range starts at
    0 and ends at the arm's length scaled so that the largest bp_stop in the
    file corresponds to 10000.

    Args:
        rows: Data lines of the TSV file (header already removed).

    Returns:
        A list of space-delimited strings:
        "chromosome arm band iscn_start iscn_stop bp_start bp_stop".

    Raises:
        ZeroDivisionError: If there are rows but none has a positive bp_stop.
    """
    arms = []
    for row in rows:
        if not row.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        columns = row.split('\t')
        # int() ignores the trailing newline still attached to the last column
        arms.append((columns[0], columns[1], int(columns[2]), int(columns[3])))

    max_chr_length = max([0] + [bp_stop for _, _, _, bp_stop in arms])

    bands = []
    for chrom, arm, bp_start, bp_stop in arms:
        # Round the scaled value itself. The previous expression rounded the
        # (already integral) base-pair difference and then divided, which
        # produced unrounded floats such as "3333.3333333333335".
        iscn_stop = round((bp_stop - bp_start) / max_chr_length * 10000)
        bands.append(' '.join([
            chrom, arm, '1',
            '0', str(iscn_stop),
            str(bp_start), str(bp_stop)
        ]))
    return bands


def _rows_to_bands(rows):
    """Convert the lines of one TSV file (header first) into band strings.

    A header with exactly 4 tab-separated columns selects the arm-only format
    (e.g. ideogram_banana_v0.1.tsv); any other header is treated as the NCBI
    band format, whose rows are passed through with tabs turned into spaces.
    """
    header, data_rows = rows[0], rows[1:]
    if len(header.split('\t')) == 4:
        # #chromosome arm bp_start bp_stop
        return _arm_rows_to_bands(data_rows)
    # #chromosome arm band iscn_start iscn_stop bp_start bp_stop stain density
    return [
        row.replace('\n', '').replace('\t', ' ')
        for row in data_rows
        if row.strip()
    ]


def main(in_dir='../../data/bands/ncbi/',
         out_dir='../../data/bands/native/',
         mappings=None):
    """Convert every .tsv file directly inside in_dir to JSON in out_dir.

    Each output file contains a JSON object with a single "chrBands" list of
    space-delimited band strings.

    Args:
        in_dir: Directory holding the input TSV files. Only its top level is
            scanned; files not ending in ".tsv" (e.g. README.md) are skipped.
        out_dir: Existing directory that receives the ".json" files.
        mappings: Dict from input base name to a list of output base names.
            Defaults to the module-level output_mappings. Inputs without an
            entry keep their own base name (for pre-accessioned assemblies,
            e.g. "banana.tsv"). The same JSON is written under every listed
            name.
    """
    from os.path import join as join_path

    if mappings is None:
        mappings = output_mappings

    filenames = []
    for _dirpath, _dirnames, found in walk(in_dir):
        print(found)
        filenames.extend(found)
        break  # Only the top level of in_dir is of interest

    for input_file in filenames:
        if not input_file.endswith('.tsv'):
            # Skip e.g. README.md
            continue

        base_name = input_file[:-len('.tsv')]
        output_names = mappings.get(base_name, [base_name])

        with open(join_path(in_dir, input_file), 'r', encoding='utf-8') as tsv:
            rows = tsv.readlines()
        if not rows:
            # Without a header line the format cannot be determined
            print('Skipping empty file: ' + input_file)
            continue

        payload = json.dumps({'chrBands': _rows_to_bands(rows)})

        # First name is the default; any further name is the
        # assembly-specific alias used when the default omits the assembly
        for name in output_names:
            out_path = join_path(out_dir, name + '.json')
            with open(out_path, 'w', encoding='utf-8') as out:
                out.write(payload)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()