ideogram
Version:
Chromosome visualization for the web
258 lines (214 loc) • 8.19 kB
Plain Text
"""Fetch cytogenetic band data from remote MySQL databases
"""
# TODO:
# - Bonus: Convert this data into AGP 2.0, send data missing from NCBI to them

import os
import json
from concurrent.futures import ThreadPoolExecutor
import argparse

from . import settings

# Command-line interface.  Only --output_dir is active today; the caching
# flags are commented out because the fresh_run / fill_cache machinery is a
# work in progress (see the dated note further down in this file).
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--output_dir',
    help='Directory to send output data to',
    default='../../data/bands/native/')
# parser.add_argument('--fresh_run',
# help='Do you want to use cached data, or fresh data fetched over ' +
# 'the Internet?',
# default='True')
# parser.add_argument('--fill_cache',
# help='Do you want to populate the cache? Only applicable for fresh runs.',
# default='False')
args = parser.parse_args()
def t_or_f(arg):
    """Parse a truthy/falsy command-line string into a bool.

    Accepts any case-insensitive prefix of "true" or "false"
    (e.g. 't', 'TRUE', 'f', 'False').

    :param arg: value to interpret; coerced with str() first
    :return: True or False
    :raises ValueError: if the input is empty or matches neither word
    """
    ua = str(arg).upper()
    # Guard against '': ''.startswith matches everything, which previously
    # made the empty string parse as True.  Unrecognized input used to fall
    # through and silently return None; raise explicitly instead.
    if ua and 'TRUE'.startswith(ua):
        return True
    elif ua and 'FALSE'.startswith(ua):
        return False
    else:
        raise ValueError('Expected a true/false value, got: ' + str(arg))
# eweitz, 2017-12-01:
# The arguments '--fresh_run=False' and '--fill_cache=False' do not yet work.
# The code related to these arguments is a work in progress.
# They are intended to speed up development by enabling runs to
# bypass remote querying and download.
fresh_run = True  # t_or_f(args.fresh_run)
fill_cache = False  # t_or_f(args.fill_cache)

output_dir = args.output_dir
cache_dir = output_dir + 'cache/'
log_name = 'fetch_cytobands_from_dbs'

from . import settings
logger = settings.init(fresh_run, fill_cache, output_dir, cache_dir, log_name)

# NOTE(review): these star imports supply fetch_from_ensembl_genomes,
# fetch_from_ucsc, fetch_maize_centromeres, merge_centromeres,
# parse_centromeres and natural_sort used below -- presumed from usage in
# this file; confirm against the sibling modules.
from .utils import *
from .ucsc import *
from .ensembl import *
from .genomaize import *
from .centromeres import *

# Cumulative per-source fetch times; updated by the worker threads
times = {'ncbi': 0, 'ucsc': 0, 'ensembl': 0}
# UCSC databases that could not be mapped to GenBank assembly IDs
unfound_dbs = []
# Zea mays centromere positions; populated via fetch_parties()
maize_centromeres = {}

# Use makedirs, not mkdir: the default output_dir
# ('../../data/bands/native/') is nested, and mkdir raises if any
# intermediate directory is missing.
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Caching scenarios
#
# | fresh_run  | True | True  | False | False |
# | fill_cache | True | False | True  | False |
# | Scenario   | A    | B     | C     | D     |
#
# Scenario A: Repopulate cache. Slow run, prepare later cache.
# Scenario B: For production. Slow run, don't write to cache.
# Scenario C: No-op. Illogical state, throw error.
# Scenario D: For development, or debugging. Fast run, usable offline.
#
# Scenario D can be useful when working without Internet access, e.g. on a
# train or in rural areas. It also enables much faster iteration even when
# connectivity is good. Be sure to run Scenario A first, though!

# Scenario C: filling the cache requires fetching fresh data, so this
# combination is rejected outright.
if fresh_run is False and fill_cache:
    raise ValueError(
        'Error: Attempting to use cache, but no cache exists. ' +
        # Fixed suggestion: previously said '--fill_cache=True' twice
        'Use other arguments, e.g. "--fresh_run=True --fill_cache=True".'
    )

if not os.path.exists(cache_dir):
    if fill_cache:
        # Scenario A: create the cache directory before the slow run
        os.makedirs(cache_dir)
    if fresh_run is False:
        # Scenario D requested, but Scenario A was never run
        raise ValueError(
            'No cache available. ' +
            'Run with "--fresh_run=True --fill_cache=True" then try again.'
        )
def patch_telomeres(bands_by_chr):
    """Account for special case with Drosophila melanogaster.

    Prepends a synthetic 'pter' band to any chromosome whose first band
    does not start at coordinate 1, then labels every band as lying on
    the q arm.

    TODO: Per https://github.com/eweitz/ideogram/issues/189, excise these lines:
    * (1) "X q pter 1 10976635 1 10976635 gpos"
    * (2) "2R q pter 1 4460914 1 4460914 gpos"
    * (3) "3R q pter 1 30597427 1 30597427 gpos"
    And report data problem to NCBI.
    Before reporting bug, investigate possibility that above 'pter' issues could
    be due to an erroneous start coordinate of 1. Look at linked GitHub issue;
    the large black band ends near where one could envision a centromere.
    """
    # Fill any gap before the first band with a 'pter' placeholder
    for chr_name, bands in bands_by_chr.items():
        first_start = bands[0][1]
        if first_start != '1':
            gap_end = str(int(first_start) - 1)
            bands.insert(0, ['pter', '1', gap_end, '1', gap_end, 'gpos'])

    # Mark every band as belonging to the q arm
    arm_labeled = {}
    for chr_name, bands in bands_by_chr.items():
        labeled_bands = []
        for band in bands:
            band.insert(0, 'q')
            labeled_bands.append(band)
        arm_labeled[chr_name] = labeled_bands

    return arm_labeled
def pool_processing(party):
    """Called once per "party" (i.e. UCSC, Ensembl, or GenoMaize)
    to fetch cytoband data from each.

    Runs on a worker thread (see fetch_parties); returns
    [party, org_map, times].
    """
    # NOTE(review): `times` and `unfound_dbs` are module-level objects that
    # are rebound/extended here from multiple threads -- presumed safe only
    # because each party touches different keys; confirm.
    global unfound_dbs
    global times
    logger.info('Entering pool processing, party: ' + party)
    if party == 'ensembl':
        # fetch_from_ensembl_genomes returns an updated `times`; rebinding
        # the global makes it visible to the other workers
        org_map, times = fetch_from_ensembl_genomes(times, logger)
    elif party == 'ucsc':
        org_map, times, unfound_dbs_subset =\
            fetch_from_ucsc(logger, times, unfound_dbs)
        unfound_dbs += unfound_dbs_subset
    elif party == 'genomaize':
        # GenoMaize supplies only maize centromere data; no timing info
        org_map = fetch_maize_centromeres(output_dir)
    # NOTE(review): an unrecognized `party` would leave org_map unbound and
    # raise NameError at the return below; callers only pass the three above.
    logger.info('exiting pool processing')
    return [party, org_map, times]
def log_end_times(times):
    """Log how long fetching from each data source took."""
    logger.info('')
    # Preserve historical log order: ucsc, ncbi, ensembl
    for source in ('ucsc', 'ncbi', 'ensembl'):
        logger.info('time ' + source + ':')
        logger.info(times[source])
def get_nonredundant_organisms(asm_data_list):
    """ Third parties (e.g. UCSC) can have data for the same organism.
    Convert any such duplicate data into a non-redundant (NR) organism map.

    :param asm_data_list: list of [party, org_map, party_times] entries,
        as returned by pool_processing
    :return: dict mapping organism name to that organism's assembly data,
        keeping the first party's data when several parties cover the
        same organism
    """
    global times
    nr_org_map = {}
    seen_orgs = {}
    for party, org_map, party_times in asm_data_list:
        logger.info('Iterating organisms from ' + party)
        times[party] = party_times
        for org in org_map:
            logger.info('\t' + org)
            if org in seen_orgs:
                logger.info('Already saw ' + org)
                continue
            # Record the organism; previously seen_orgs was never populated,
            # so later parties silently overwrote earlier parties' data
            seen_orgs[org] = True
            nr_org_map[org] = org_map[org]
    return nr_org_map
def refine_bands(org, bands_by_chr, maize_centromeres):
    """Adjust telomeres and centromeres as needed in each organism.

    Returns the (possibly replaced) bands-by-chromosome dict.
    """
    # Drosophila needs synthetic telomere bands prepended
    if org == 'drosophila-melanogaster':
        bands_by_chr = patch_telomeres(bands_by_chr)

    # Assign cytogenetic arms for each band; maize centromeres come from
    # GenoMaize rather than being parsed out of the band data itself
    if org == 'zea-mays':
        return merge_centromeres(bands_by_chr, maize_centromeres, logger)
    return parse_centromeres(bands_by_chr, logger)
def write_chr_bands(org, nr_org_map, maize_centromeres):
    """Write chromosome cytoband data for one organism to a JSON file.

    Returns [genbank_accession, db] for the assembly that was written.
    """
    # When several assemblies exist for an organism, take the one that
    # sorts highest
    genbank_accession, db, bands_by_chr = \
        sorted(nr_org_map[org], reverse=True)[0]

    bands_by_chr = refine_bands(org, bands_by_chr, maize_centromeres)

    # Collapse the chromosome-to-band dict into a flat list of strings
    band_list = [
        chr + ' ' + ' '.join(band)
        for chr in natural_sort(list(bands_by_chr.keys()))
        for band in bands_by_chr[chr]
        if band is not None
    ]

    # Write actual cytoband data to file,
    # e.g. ../data/bands/native/anopheles-gambiae.json
    with open(output_dir + org + '.json', 'w') as f:
        f.write(json.dumps({'chrBands': band_list}))

    return [genbank_accession, db]
def fetch_parties():
    """ Request cytoband data from all relevant institutes, simultaneously

    :return: (parties, maize_centromeres) where parties is a list of
        [party, org_map, times] results and maize_centromeres maps maize
        chromosomes to centromere data from GenoMaize
    """
    parties = []
    # Initialize up front: previously this name was only bound inside the
    # loop, so a missing genomaize result caused UnboundLocalError at return
    maize_centromeres = {}
    num_threads = 3
    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # Use the module logger rather than print, consistent with the
        # rest of this file
        logger.info('in fetch_cytobands_from_dbs, main')
        party_list = ['ensembl', 'ucsc', 'genomaize']
        for result in pool.map(pool_processing, party_list):
            party = result[0]
            if party == 'genomaize':
                # GenoMaize supplies centromere positions, not band data
                maize_centromeres = result[1]
            else:
                parties.append(result)
    return parties, maize_centromeres
def main():
    """Fetch bands from each party and write one JSON file per organism.

    Returns a manifest dict mapping organism name to
    [genbank_accession, db].
    """
    global unfound_dbs
    party_list, maize_centromeres = fetch_parties()

    logger.info('')
    logger.info('UCSC databases not mapped to GenBank assembly IDs:')
    logger.info(', '.join(unfound_dbs))
    logger.info('')

    nr_org_map = get_nonredundant_organisms(party_list)

    # One output file per organism; collect each file's assembly identifiers
    manifest = {
        org: write_chr_bands(org, nr_org_map, maize_centromeres)
        for org in nr_org_map
    }
    return manifest
# Script entry point; keeps main() importable without side effects
if __name__ == '__main__':
    main()