UNPKG

ideogram

Version:

Chromosome visualization for the web

189 lines (157 loc) 6.01 kB
""" Creates simulated genome annotation data Data is currently simulated single-nucleotide variations (SNVs). Examples: # Create 1000 annots in 3 tracks with 5% in 1st track, 80% in 2nd, 15% in 3rd python3 create_annots.py --track_annot_percents 5 80 15 # Create 2500 annots in 10 tracks and include track metadata python3 create_annots.py --num_annots 2500 --num_tracks 7 --include_metadata # Create 90000 annots evenly distributed among 3 tracks python3 create_annots.py --num_annots 90000 --num_tracks 5 """ # TODO: # - Add handling for non-human organisms # - Enhance with more data than simply position, e.g.: # - Variant type (use Sequence Ontology ID) # - Molecular consequence (use SO ID) # - Clinical significance # - Transcript accession # - HGVS expression import argparse import json import math import random parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('--output_dir', help='Directory to send output data to', default='../../data/annotations/') parser.add_argument('--num_annots', help='Number of annotations to create', type=int, default=1000) parser.add_argument('--assembly', help='Genome assembly reference to use: GRCh38 or GRCh37', default='GRCh38') parser.add_argument('--num_tracks', help='Number of annotation tracks', type=int, default=3) parser.add_argument('--include_metadata', help='Whether to include metadata about the track set', action='store_true' ) parser.add_argument('--track_annot_percents', help=( 'Percentage of total annotations in each track, e.g. ' + '5,80,15 for 5%% in 1st track, 80%% in 2nd, 15%% in 3rd. ' + 'Defaults to even distribution of annots among tracks.' ), metavar='int', type=int, nargs='*') parser.add_argument('--density', help=( 'Whether annotations are "dense" or "sparse". ' + 'Dense is where each genomic feature (e.g. each ' + 'gene) has annotations on all tracks. Sparse is ' + 'where each feature has an annotation on only one track.' ), default='sparse') args = parser.parse_args() output_dir = args.output_dir num_annots = args.num_annots assembly = args.assembly num_tracks = args.num_tracks include_metadata = args.include_metadata track_annot_percents = args.track_annot_percents density = args.density track_index_pool = [] if track_annot_percents is None: track_annot_percents = [] for i in range(0, num_tracks): track_annot_percents.append(math.ceil(100/num_tracks)) for i, track_annot_percent in enumerate(track_annot_percents): track_index_pool += [i]*track_annot_percent annots = [] alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' chrs = [ '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y' ] lengths_GRCh38 = { '1': 248956422, '2': 242193529, '3': 198295559, '4': 190214555, '5': 181538259, '6': 170805979, '7': 159345973, '8': 145138636, '9': 138394717, '10': 133797422, '11': 135086622, '12': 133275309, '13': 114364328, '14': 107043718, '15': 101991189, '16': 90338345, '17': 83257441, '18': 80373285, '19': 58617616, '20': 64444167, '21': 46709983, '22': 50818468, 'X': 156040895, 'Y': 57227415 } lengths_GRCh37 = { '1': 249250621, '2': 243199373, '3': 198022430, '4': 191154276, '5': 180915260, '6': 171115067, '7': 159138663, '8': 146364022, '9': 141213431, '10': 135534747, '11': 135006516, '12': 133851895, '13': 115169878, '14': 107349540, '15': 102531392, '16': 90354753, '17': 81195210, '18': 78077248, '19': 59128983, '20': 63025520, '21': 48129895, '22': 51304566, 'X': 155270560, 'Y': 59373566 } if assembly == 'GRCh38': chr_lengths = lengths_GRCh38 else: chr_lengths = lengths_GRCh37 for chr in chrs: annots.append({'chr': chr, 'annots': []}) for i in range(0, num_annots): j = str(i + 1) chr = i % 24 chr_length = chr_lengths[chrs[chr]] # Distribute annotations evenly across this chromosome start = int((i * chr_length)/num_annots + 1) length = 0 annot = [ 'rs' + j, start, length ] if density == 'sparse': random_index = random.randrange(0, 99) track_index = track_index_pool[random_index] annot.append(track_index) else: for k in range(0, num_tracks): annot_value = random.randrange(0, 99) annot.append(annot_value) annots[chr]['annots'].append(annot) if density == 'sparse': track_keys = ['trackIndex'] else: track_keys = [] for i in range(0, num_tracks): track_keys.append('track_' + str(i + 1)) top_annots = {} if include_metadata: top_annots['metadata'] = { 'species': 'human', 'assembly': assembly, 'numTracks': num_tracks, 'numAnnots': num_annots } trackLabels = [] for i in range(num_tracks): trackLabels.append('Sample ' + alphabet[i]) top_annots['metadata']['trackLabels'] = trackLabels top_annots['keys'] = ['name', 'start', 'length'] + track_keys top_annots['annots'] = annots annots = json.dumps(top_annots) num_annots = str(num_annots) output_path = output_dir + num_annots + '_virtual_snvs.json' open(output_path, 'w').write(annots) print( 'Output ' + num_annots + ' ' + density + ' annotations ' + 'on ' + str(num_tracks) + ' tracks ' + 'on assembly ' + assembly + ' ' + 'to ' + output_path )