ideogram
Version:
Chromosome visualization for the web
413 lines (307 loc) • 12.8 kB
Plain Text
"""Download AGPs from NCBI and format chromosome data, including centromeres"""
import urllib.request as request
from urllib.parse import quote
import ftplib
import os
import json
import gzip
import io
from concurrent.futures import ThreadPoolExecutor
import time
import pprint
import fetch_chromosomes.settings as settings
import fetch_chromosomes.convert_band_data as convert_band_data
import fetch_chromosomes.fetch_cytobands_from_dbs as fetch_cytobands_from_dbs
import fetch_chromosomes.utils as utils
# Directory that receives the generated chromosome JSON files
output_dir = '../../data/bands/native/'

logger = settings.get_logger(output_dir, 'get_chromosomes')

# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of `os.path.exists` followed by `os.mkdir`
os.makedirs(output_dir, exist_ok=True)

# Organism names (keys) for which centromere data was found; values are 1
orgs_with_centromere_data = {}

ftp_domain = 'ftp.ncbi.nlm.nih.gov'

# Maps organism name -> [assembly accession, assembly name];
# populated by write_centromere_data
manifest = {}
def get_chromosome_object(agp):
    """Extract centromere coordinates and chromosome length from AGP data.

    Blank lines and lines starting with '#' are skipped. Every other line
    is split on tabs; column 1 is read as the accession, columns 2 and 3 as
    integer start/stop coordinates, and column 7 is compared against
    'centromere'.

    Args:
        agp: Plain-text content of one AGP file.

    Returns:
        A dict with keys 'accession' (from the first data line), 'type'
        (always 'nuclear'), 'length' (stop coordinate of the last data line)
        and, when a centromere line exists, 'centromere' with 'start' and
        'length'.  An input without data lines yields an empty dict.

    Raises:
        IndexError: If a data line has fewer than 7 tab-separated columns.
        ValueError: If the start or stop column is not an integer.
    """
    chromosome = {}
    last_stop = None

    for line in agp.split('\n'):
        if len(line) == 0 or line[0] == '#':
            continue

        tabs = line.split('\t')
        acc = tabs[0]
        start = int(tabs[1])
        stop = int(tabs[2])
        comp_type = tabs[6]

        # Record the accession once.  The original guard tested the key
        # 'acc', which is never stored, so it was re-assigned on every line.
        if 'accession' not in chromosome:
            chromosome['accession'] = acc
            chromosome['type'] = 'nuclear'

        if comp_type == 'centromere':
            chromosome['centromere'] = {
                'start': start,
                'length': stop - start
            }

        last_stop = stop

    # Use the last data line's stop as the length.  The original relied on
    # the line index being `len - 2`, which only holds when the text ends
    # with exactly one trailing newline.
    if last_stop is not None:
        chromosome['length'] = last_stop

    return chromosome
def fetch_ftp(ftp, file_name, retry_delay=1):
    """Download a file over FTP in binary mode, retrying once on a temp error.

    Args:
        ftp: Connected object exposing `retrbinary(cmd, callback=...)`,
            e.g. an `ftplib.FTP` instance.
        file_name: Name or path of the remote file to retrieve.
        retry_delay: Seconds to wait before the single retry (default 1).

    Returns:
        An `io.BytesIO` holding the downloaded bytes.  Its position is left
        at the end of the data; callers must `seek(0)` or use `getvalue()`.

    Raises:
        ftplib.error_temp: If the retry also fails with a temporary error.
    """
    buffer = io.BytesIO()
    command = 'RETR ' + file_name

    try:
        ftp.retrbinary(command, callback=buffer.write)
    except ftplib.error_temp:
        # E.g. "ftplib.error_temp: 425 EPSV: Address already in use"
        logger.warning(
            'Caught FTP error; retrying in ' + str(retry_delay) + ' second(s)'
        )
        time.sleep(retry_delay)
        # Discard anything received before the failure; otherwise the retry
        # would append a full copy after the partial data.
        buffer.seek(0)
        buffer.truncate()
        ftp.retrbinary(command, callback=buffer.write)

    return buffer
# Retrieves a gzip-compressed file over FTP and returns it as plain text
def fetch_gzipped_ftp(ftp, file_name):
    """Download `file_name` via `fetch_ftp`, gunzip it, and decode as UTF-8.

    Args:
        ftp: FTP connection passed through to `fetch_ftp`.
        file_name: Name of the gzip-compressed remote file.

    Returns:
        The decompressed file content as a string.
    """
    compressed = fetch_ftp(ftp, file_name)
    compressed.seek(0)  # Rewind so decompression starts at the first byte

    with gzip.GzipFile(fileobj=compressed) as unzipped:
        raw_bytes = unzipped.read()

    return raw_bytes.decode('utf-8')
def change_ftp_dir(ftp, wd):
    """Change the FTP working directory, reporting success as a status code.

    Args:
        ftp: Object exposing `cwd(path)`, e.g. an `ftplib.FTP` instance.
        wd: Directory to switch to.

    Returns:
        0 when the directory change succeeded, 1 when the server raised
        `ftplib.error_perm` (the error is logged as a warning).
    """
    logger.info('Changing FTP working directory to: ' + wd)

    try:
        ftp.cwd(wd)
    except ftplib.error_perm as error:
        logger.warning(error)
        return 1

    return 0
# GRCh38 defines centromeres and heterochromatin in regions files, not AGP gaps
def download_genome_regions(ftp, regions_ftp):
    """Download an assembly regions report and collect its centromere rows.

    Rows are tab-separated; blank lines and lines starting with '#' are
    ignored.  A row counts as a centromere when its fifth column is 'CEN';
    its second column is used as the chromosome key and the third and fourth
    columns as start and stop.

    Args:
        ftp: FTP connection used for the download.
        regions_ftp: Path of the regions report on the FTP server.

    Returns:
        Dict mapping chromosome name to {'start': int, 'length': int}.
        Empty when the report has no 'CEN' rows.
    """
    centromeres = {}

    path_segments = regions_ftp.split('/')
    parent_dir = '/'.join(path_segments[:-1])
    change_ftp_dir(ftp, parent_dir)

    logger.info('Downloading genome regions ' + regions_ftp)

    # Example:
    # ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.37_GRCh38.p11/GCF_000001405.37_GRCh38.p11_assembly_regions.txt
    report = fetch_ftp(ftp, regions_ftp).getvalue().decode('utf-8')

    for row in report.split('\n'):
        if row == '' or row.startswith('#'):
            continue

        fields = row.split('\t')
        if fields[4] != 'CEN':
            continue

        name = fields[1]
        begin = int(fields[2])
        end = int(fields[3])
        centromeres[name] = {
            'start': begin,
            'length': end - begin
        }

    if centromeres:
        logger.info('Found centromeres in regions for FTP path ' + regions_ftp)

    return centromeres
def write_centromere_data(organism, asm_name, asm_acc, output_dir, chrs):
    """Write chromosome band lines for one assembly and record it in `manifest`.

    Each chromosome becomes either two space-separated lines (a 'p' and a 'q'
    arm split at the centromere midpoint) or one 'n' line when it has no
    centromere.  Positions are given both scaled to 0-10000 relative to the
    longest chromosome and in base pairs.  The result is written as JSON
    (`{'chrBands': [...]}`) to two files in `output_dir`:
    `<organism>[-no-bands].json` and `<organism>-<asm_acc>.json`.

    Args:
        organism: Hyphenated lower-case organism name, e.g. 'homo-sapiens'.
        asm_name: Assembly name, e.g. 'GRCh38.p11'.
        asm_acc: Assembly accession, used in the long output file name.
        output_dir: Output directory path; must end with a path separator
            because file names are appended by string concatenation.
        chrs: List of chromosome dicts with 'name', 'length' and optionally
            'centromere' ({'start': int, 'length': int}).

    Side effects:
        Writes two files and sets `manifest[organism] = [asm_acc, asm_name]`.
    """
    logger.info(
        'Centromeres found for ' + organism + ' ' +
        'in genome assembly ' + asm_name + ' (' + asm_acc + ')'
    )

    # These assemblies get a '-no-bands' suffix on the short file name
    leaf = ''
    if (
        (organism == 'homo-sapiens' and asm_name[:3] == 'GRC') or
        (organism == 'mus-musculus' and asm_name[:3] in ('GRC', 'MGS')) or
        (organism == 'rattus-norvegicus' and asm_name[:4] == 'Rnor')
    ):
        logger.info('Got no-bands assembly: ' + asm_name)
        leaf = '-no-bands'

    output_path = output_dir + organism + leaf + '.json'
    long_output_path = output_dir + organism + '-' + asm_acc + '.json'

    # The longest chromosome defines the 0-10000 scale for all the others
    max_chr_length = max((chr['length'] for chr in chrs), default=0)

    adapted_chromosomes = []
    for chr in chrs:
        name = chr['name']
        length = chr['length']

        # Round the scaled value, not the raw length.  The original
        # `round(length) / max_chr_length * 10000` emitted floats such as
        # '10000.0', unlike the integer p-arm stop below.
        iscn_stop_q = str(round(length / max_chr_length * 10000))
        length = str(length)

        if 'centromere' in chr:
            cen = chr['centromere']
            midpoint = cen['start'] + round(cen['length'] / 2)
            iscn_stop_p = str(round(midpoint / max_chr_length * 10000))
            midpoint = str(midpoint)
            p = name + ' p 1 0 ' + iscn_stop_p + ' 0 ' + midpoint
            q = (
                name + ' q 1 ' + str(int(iscn_stop_p) + 1) + ' ' +
                iscn_stop_q + ' ' + midpoint + ' ' + length
            )
            adapted_chromosomes += [p, q]
        else:
            adapted_chromosomes.append(
                name + ' n 1 0 ' + iscn_stop_q + ' 0 ' + length
            )

    js_chrs = json.dumps({'chrBands': adapted_chromosomes})

    with open(output_path, 'w') as f:
        f.write(js_chrs)

    with open(long_output_path, 'w') as f:
        f.write(js_chrs)

    manifest[organism] = [asm_acc, asm_name]
def download_genome_agp(ftp, asm):
    """Download all AGP files of one assembly and write centromere data.

    Every file listed in the assembly's AGP directory is fetched and parsed
    into a chromosome object.  If none of the AGP files mentions a
    centromere, the assembly's regions report (when available) is used as a
    fallback source.  Output is only written when centromere data was found
    by either route.

    Args:
        ftp: FTP connection to the NCBI server.
        asm: Dict with keys 'agp_ftp_wd', 'acc', 'organism',
            'asm_output_dir', 'name', 'asm_segment' and 'regions_ftp'.

    Returns:
        None.  Returns early when the AGP directory cannot be entered.
    """
    agp_ftp_wd = asm['agp_ftp_wd']
    asm_acc = asm['acc']
    organism = asm['organism']
    asm_output_dir = asm['asm_output_dir']  # read but not used below
    asm_name = asm['name']
    asm_segment = asm['asm_segment']  # read but not used below
    regions_ftp = asm['regions_ftp']

    chromosomes = []
    seen_accessions = set()
    found_centromeres = False

    if change_ftp_dir(ftp, agp_ftp_wd) == 1:
        return

    agp_files = ftp.nlst()
    logger.info('List of files in FTP working directory')
    logger.info(agp_files)

    for agp_file in agp_files:
        # Download each chromosome's compressed AGP file
        # We retrieve both agp.gz and comp.agp.gz files
        # Former is more common, latter used for some organisms (e.g. platypus)
        # Example full URL of file:
        # 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF_000001515.7_Pan_tro_3.0/GCF_000001515.7_Pan_tro_3.0_assembly_structure/Primary_Assembly/assembled_chromosomes/AGP/chr1.agp.gz'
        logger.info(
            'Retrieving from FTP (' + asm_name + ', ' + asm_acc + '): ' +
            agp_file
        )
        agp = fetch_gzipped_ftp(ftp, agp_file)

        chromosome = get_chromosome_object(agp)
        accession = chromosome['accession']

        # Keep only the first chromosome object seen per accession
        if accession not in seen_accessions:
            chromosome['name'] = agp_file.split('.')[0].split('chr')[1]
            chromosomes.append(chromosome)
            seen_accessions.add(accession)

        if "centromere" not in str(agp):
            base_name = agp_file.split(".")[0]
            logger.info(
                'No centromere data found in AGP for ' + organism + ' ' +
                'genome assembly ' + asm_name + ' chromosome ' + base_name
            )
            continue

        found_centromeres = True
        orgs_with_centromere_data[organism] = 1

    if not found_centromeres:
        logger.info(
            'No centromere data found in any AGP for ' + organism + ' ' +
            'in genome assembly ' + asm_name + ' (' + asm_acc + ')'
        )
        if regions_ftp != '':
            # Fall back to the regions report for centromere coordinates
            centromeres = download_genome_regions(ftp, regions_ftp)
            if len(centromeres) > 0:
                found_centromeres = True
                orgs_with_centromere_data[organism] = 1
                for chromosome in chromosomes:
                    chr_name = chromosome['name']
                    if chr_name in centromeres:
                        chromosome['centromere'] = centromeres[chr_name]

    if found_centromeres:
        write_centromere_data(
            organism, asm_name, asm_acc, output_dir, chromosomes
        )
def find_genomes_with_centromeres(ftp, asm_summary_response):
    """Process every assembly in an ESummary response.

    For each assembly record, derives the FTP directory holding its AGP
    files, bundles the assembly's properties into a dict, passes that dict
    to `download_genome_agp`, and appends it to the module-level `asms`.

    Args:
        ftp: FTP connection to the NCBI server.
        asm_summary_response: Parsed JSON from an assembly ESummary request;
            its 'result' entry maps UIDs to assembly records.
    """
    logger.info('numbers of keys in asm_summary_response:')
    logger.info(len(asm_summary_response['result'].keys()))

    records = asm_summary_response['result']

    for uid, record in records.items():
        # Omit list of UIDs
        if uid == 'uids':
            continue

        acc = record['assemblyaccession']  # Accession.version
        name = record['assemblyname']
        taxid = record['taxid']
        organism = record['speciesname'].lower().replace(' ', '-').strip()

        # one fully banded (downstream), one not
        # if organism != 'homo-sapiens' and organism != 'pongo-abelii':
        #     continue

        normalized_name = name.replace(' ', '_').replace('-', '_')
        asm_segment = acc + '_' + normalized_name

        # NCBI genomes FTP directories have path segments corresponding to a
        # split assembly accession, e.g. GCF_000001515 -> GCF/000/001/515.
        # A slash is inserted after every third character.
        acc_chars = acc.split('.')[0].replace('_', '')
        split_acc = ''.join(
            char + '/' if position % 3 == 0 else char
            for position, char in enumerate(acc_chars, start=1)
        )

        # Example: ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/515/GCF_000001515.7_Pan_tro_3.0/GCF_000001515.7_Pan_tro_3.0_assembly_structure/Primary_Assembly/assembled_chromosomes/AGP/chr1.agp.gz
        # FTP working directory of AGP files
        agp_ftp_wd = (
            '/genomes/all/' + split_acc +
            asm_segment + '/' + asm_segment + '_assembly_structure/' +
            'Primary_Assembly/assembled_chromosomes/AGP/'
        )

        # Keep only the server-relative part of the regions report URL
        regions_ftp = record['ftppath_regions_rpt']
        if regions_ftp != '':
            regions_ftp = regions_ftp.split('nih.gov')[1]

        asm_output_dir = output_dir + organism + '/' + asm_segment + '/'

        asm = {
            'acc': acc,
            'name': name,
            'taxid': taxid,
            'organism': organism,
            'agp_ftp_wd': agp_ftp_wd,
            'asm_output_dir': asm_output_dir,
            'asm_segment': asm_segment,
            'regions_ftp': regions_ftp
        }

        download_genome_agp(ftp, asm)
        asms.append(asm)
def chunkify(lst, n):
    """Split `lst` into `n` interleaved chunks.

    Chunk `k` holds the elements at indices k, k + n, k + 2n, ...

    Args:
        lst: Sequence to split.
        n: Number of chunks to produce.

    Returns:
        A list of `n` slices of `lst`.
    """
    chunks = []
    for offset in range(n):
        chunks.append(lst[offset::n])
    return chunks
def pool_processing(uid_list):
    """Fetch assembly summaries for a batch of UIDs and process each assembly.

    Requests an ESummary for the given assembly UIDs, waits 3 seconds, then
    opens an FTP connection and hands the parsed response to
    `find_genomes_with_centromeres`.

    Args:
        uid_list: Iterable of assembly UID strings.
    """
    joined_uids = ','.join(uid_list)
    asm_summary = esummary + '&db=assembly&id=' + joined_uids
    logger.info('Fetching ' + asm_summary)

    # Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?retmode=json&db=assembly&id=733711
    with request.urlopen(asm_summary) as response:
        data = json.loads(response.read().decode('utf-8'))

    time.sleep(3)  # Delay for 3 seconds

    # The context manager closes the connection even when processing raises;
    # previously an exception skipped ftp.quit() and leaked the socket.
    with ftplib.FTP(ftp_domain) as ftp:
        ftp.login()
        find_genomes_with_centromeres(ftp, data)
# NOTE(review): an NCBI API key is hard-coded in source.  It remains the
# default so existing runs keep working, but it can now be overridden with the
# NCBI_API_KEY environment variable; consider removing the default.
api_key = '&api_key=' + os.environ.get(
    'NCBI_API_KEY', '7e33ac6a08a6955ec3b83d214d22b21a2808'
)
eutils = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
esearch = eutils + 'esearch.fcgi?retmode=json' + api_key
esummary = eutils + 'esummary.fcgi?retmode=json' + api_key
elink = eutils + 'elink.fcgi?retmode=json' + api_key

# Assembly dicts appended by find_genomes_with_centromeres
asms = []

term = quote(
    '("latest refseq"[filter]) AND '
    '("chromosome level"[filter] OR "complete genome"[filter]) AND ' +
    '(animals[filter] OR plants[filter] OR fungi[filter] OR protists[filter])'
)

asm_search = esearch + '&db=assembly&term=' + term + '&retmax=10000'

# Example: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmode=json&db=assembly&term=("latest refseq"[filter] AND "chromosome level"[filter]) AND (animals[filter] OR plants[filter] OR fungi[filter] OR protists[filter])&retmax=10000
with request.urlopen(asm_search) as response:
    data = json.loads(response.read().decode('utf-8'))

# Returns ~1000 ids
top_uid_list = data['esearchresult']['idlist']

logger.info('Assembly UIDs returned in search results: ' + str(len(top_uid_list)))

non_ncbi_manifest = fetch_cytobands_from_dbs.main()

# TODO: Make this configurable
num_threads = 1

uid_lists = chunkify(top_uid_list, num_threads)

with ThreadPoolExecutor(max_workers=num_threads) as pool:
    futures = [pool.submit(pool_processing, uids) for uids in uid_lists]

# All futures are finished once the executor block exits.  Report worker
# failures explicitly: the previous `pool.map(...)` call never consumed its
# results, so exceptions raised in workers were silently discarded.  The
# script still continues so that partial results are written.
for future in futures:
    error = future.exception()
    if error is not None:
        logger.error('Worker failed while processing assemblies', exc_info=error)

logger.info('manifest')
logger.info(manifest)

# logger.info('non_ncbi_manifest')
# logger.info(non_ncbi_manifest)

#non_ncbi_manifest.update(manifest)
#manifest = non_ncbi_manifest

# Write a manifest of organisms for which we have cytobands.
# This enables Ideogram.js to more quickly load those organisms.
pp = pprint.PrettyPrinter(indent=4)
manifest = pp.pformat(manifest)
manifest = "assemblyManifest = " + manifest
with open('../../src/js/assembly-manifest.js', 'w') as f:
    f.write(manifest)

logger.info('Calling convert_band_data.py')
convert_band_data.main()

logger.info('Ending get_chromosomes.py')