UNPKG

ideogram

Version:

Chromosome visualization for the web

446 lines (363 loc) 14.4 kB
"""Analyze AncestryDNA raw data against ClinVar and SNPedia.

Reads an AncestryDNA raw-data export, cross-references each variant with a
cached ClinVar VCF release (and optionally SNPedia), and writes:

* ``genome_analysis.txt``      -- plaintext summary of notable variants
* ``ancestrydna.json``         -- per-chromosome annotations for Ideogram
* ``ancestrydna-tracks.json``  -- per-chromosome clinical-significance tracks
* ``ancestrydna.bed``          -- the sample in BED detail format

Example:
    python3 analyze_ancestrydna.py --input ~/AncestryDNA.txt
"""

import argparse
import gzip
import json
import os
import re
import shutil
import urllib.request as request
from collections import OrderedDict

# Directory (relative to the working directory) for caches and outputs.
data_dir = "../../data/analysis/"

clinvar_url = "https://www.ncbi.nlm.nih.gov/clinvar/"

# ClinVar release (YYYYMMDD) that is downloaded and cached.
clinvar_date = "20170905"

# Patterns for fields of the VCF INFO column.  Raw strings keep "\d" and
# "\w" from being treated as (invalid) string escapes.
# NOTE(review): clinallele_re captures only the FIRST allele index of a
# comma-separated CLNALLE list, so only one clinical allele per record is
# ever analyzed.  Preserved as-is; widening it would change the results.
clinallele_re = re.compile(r"CLNALLE=(-?\d+)")
disease_re = re.compile(r"CLNDBN=([^;]*)")
clinsig_re = re.compile(r"CLNSIG=([^;]*)")
clinrevstat_re = re.compile(r"CLNREVSTAT=([^;]*)")
clinacc_re = re.compile(r"CLNACC=([^;]*)")
gene_re = re.compile(r"GENEINFO=(\w+)")

# Numeric allele codes written to ancestrydna.json.
allele_map = {
    "A": 0,
    "T": 1,
    "C": 2,
    "G": 3,
    "0": 4,
    "I": 4,  # indel / insertion
    "D": 4,  # indel / deletion
}

# Labels for the numeric CLNSIG codes handled by this script.
clinsig_labels = {
    0: "Uncertain significance",
    1: "Not provided",
    2: "Benign",
    3: "Likely benign",
    4: "Likely pathogenic",
    5: "Pathogenic",
    6: "Drug response",
    7: "Histocompatibility",
    255: "Other",
}

# CLNSIG code -> track index in ancestrydna-tracks.json:
#   0 = benign or likely benign
#   1 = uncertain significance
#   2 = pathogenic or likely pathogenic
clinsig_track_index = {2: 0, 3: 0, 0: 1, 4: 2, 5: 2}

# Allele calls that are not a single nucleotide (insertion, deletion,
# unknown) and therefore cannot be complemented or looked up in SNPedia.
_UNCALLED_ALLELES = ("I", "D", "0")

# SNPedia genotype comments (lower-cased) that carry no information.
_UNINFORMATIVE_COMMENTS = frozenset((
    "",
    "common in clinvar",
    "common in complete genomics",
    "common on affy axiom data",
    "normal",
    "common",
    "?",
    "none",
    "normal risk",
    "average",
    "common/normal",
    "normal (orientation reversed)",
))

# SNPedia archive; populated by main().  Used by get_snpedia_comment() when
# no explicit archive is passed in.
snpedia_json = {}


def str_to_bool(value):
    """Parse a command-line boolean.

    ``type=bool`` in argparse treats every non-empty string (including
    "false") as True.  Here the usual "false" spellings and the empty string
    map to False; any other value maps to True.
    """
    return value.strip().lower() not in (
        "", "0", "false", "f", "no", "n", "off"
    )


def complement(nt):
    """Return the complementary nucleotide of ``nt`` (one of A, T, C, G).

    Raises KeyError for any other value.
    """
    complements = {
        "A": "T",
        "T": "A",
        "C": "G",
        "G": "C",
    }
    return complements[nt]


def get_snpedia_comment(name, allele1, allele2, snpedia=None):
    """Return SNPedia's comment for a genotype, or ``[]`` if there is none.

    Args:
        name: RS ID, e.g. "rs6025".
        allele1, allele2: The sample's two allele calls.
        snpedia: SNPedia archive mapping RS ID -> record.  Defaults to the
            module-level ``snpedia_json``.

    Returns:
        The comment string, or an empty list when the RS ID is unknown, an
        allele is an indel / no-call, the genotype is not listed, or the
        comment is uninformative.
    """
    if snpedia is None:
        snpedia = snpedia_json

    if name not in snpedia or not snpedia[name]:
        return []

    # Skip insertions, deletions, or unknown calls on EITHER allele; they
    # have no complement and no SNPedia genotype key.
    if allele1 in _UNCALLED_ALLELES or allele2 in _UNCALLED_ALLELES:
        return []

    srs = snpedia[name]
    if srs["original_orientation"] == "minus":
        sample_genotype = complement(allele1) + complement(allele2)
    else:
        sample_genotype = allele1 + allele2

    if sample_genotype not in srs["genotypes"]:
        return []

    comment = srs["genotypes"][sample_genotype]["comment"]
    if comment.lower() in _UNINFORMATIVE_COMMENTS:
        return []
    return comment


def convert_to_bed(ancestry_dna_columns):
    """Convert one AncestryDNA row to a BED detail line.

    From https://www.biostars.org/p/153805/#153946:

    So here's what my original data from AncestryDNA would have looked like:

        rs4477212   1   82154   T   T
        rs3131972   1   752721  G   G
        rs12562034  1   768448  A   G

    And here's what it would look like in bed detail format:

        chr1 82153  82154  rs4477212  0 + 82153  82154  0,0,255 1 1 0 rs4477212  TT
        chr1 752720 752721 rs3131972  0 + 752720 752721 0,0,255 1 1 0 rs3131972  GG
        chr1 768447 768448 rs12562034 0 + 768447 768448 0,0,255 1 1 0 rs12562034 AG

    Args:
        ancestry_dna_columns: [rsid, chromosome, position, allele1, allele2],
            all strings; position is 1-based.

    Returns:
        Tab-separated BED detail line (no trailing newline).
    """
    columns = ancestry_dna_columns
    position = int(columns[2])
    # BED starts are 0-based and ends are exclusive, so a 1-based SNP at
    # position p spans [p - 1, p).
    bed_start = str(position - 1)
    bed_end = str(position)
    bed_line = [
        "chr" + columns[1],
        bed_start,
        bed_end,
        columns[0],
        "0\t+",
        bed_start,
        bed_end,
        "0,0,255\t1\t1\t0",
        columns[0],
        columns[3] + columns[4],
    ]
    return "\t".join(bed_line)


# Column headers of VCF file:
# #CHROM POS ID REF ALT QUAL FILTER INFO
#
# Example line from body of VCF file (INFO abbreviated):
# 1 169519049 rs6025 T C . . RS=6025;...;GENEINFO=F5:2153;...;CLNALLE=0,1;
#   ...;CLNSIG=5|255|255|255|5,6;...;
#   CLNDBN=Thrombophilia_due_to_factor_V_Leiden|...|Factor_V_deficiency,
#     hormonal_contraceptives_for_systemic_use_response_-_Toxicity/ADR;
#   CLNREVSTAT=no_criteria|no_criteria|no_criteria|no_criteria|single,exp;
#   CLNACC=RCV000000674.2|...|RCV000205002.3,RCV000211384.1;...
#
# See top of the ClinVar VCF file for description of inner INFO columns
def parse_clinvar(vcf_lines):
    """Parse ClinVar VCF lines into a per-RS-ID lookup.

    Mapping cardinalities:
        1 RS ID   : 1+ clinical alleles (one-to-many)
        1 allele  : 1+ diseases (one-to-many)
        1 disease : 1 clinical significance (one-to-one)
    In the VCF, "," separates alleles and "|" separates diseases of one
    allele.

    Args:
        vcf_lines: Iterable of lines of a ClinVar VCF file.

    Returns:
        ``(rsids, num_skipped)`` where ``rsids`` maps RS ID to
        ``{"clinalleles": [...], "clinsigs": [[clinsig, disease,
        review_status, accession], ...], "gene": str}`` and ``num_skipped``
        counts records whose first CLNALLE index is -1.
    """
    rsids = {}
    num_skipped = 0

    for line in vcf_lines:
        # Skip blank and header lines
        if not line.strip() or line[0] == "#":
            continue

        columns = line.strip().split("\t")
        rsid = columns[2]
        info = columns[7]

        clinallele_indexes = clinallele_re.search(info).group(1).split(",")
        diseases = disease_re.search(info).group(1).split(",")
        clinsig_groups = clinsig_re.search(info).group(1).split(",")
        clinrevstats = clinrevstat_re.search(info).group(1).split(",")
        clinaccs = clinacc_re.search(info).group(1).split(",")

        if clinallele_indexes[0] == "-1":
            # NOTE(review): the record is counted as skipped but is still
            # processed below (index -1 selects the last allele).  Kept
            # as-is; confirm whether a `continue` was intended here.
            num_skipped += 1

        # Ref + alts, e.g. ["A", "T", "C"]
        alleles = [columns[3]] + columns[4].split(",")

        gene_match = gene_re.search(info)
        gene = gene_match.group(1) if gene_match else ""

        # CLNALLE values are indexes into `alleles`.
        clinalleles = [alleles[int(index)] for index in clinallele_indexes]

        # Flatten allele x disease into one list of records.
        flat_clinsigs = []
        for i, clinsig_group in enumerate(clinsig_groups):
            group_diseases = diseases[i].split("|")
            group_accs = clinaccs[i].split("|")
            group_revstats = clinrevstats[i].split("|")
            for j, clinsig in enumerate(clinsig_group.split("|")):
                disease = group_diseases[j].replace("_", " ")
                # TODO: Properly decode non-Python-Unicode Unicode hex codes
                disease = disease.replace("\\x2c", ",")
                flat_clinsigs.append(
                    [int(clinsig), disease, group_revstats[j], group_accs[j]]
                )

        rsids[rsid] = {
            "clinalleles": clinalleles,
            "clinsigs": flat_clinsigs,
            "gene": gene,
        }

    return rsids, num_skipped


def _append_annot(groups, index_by_chr, chrom, annot):
    """Append ``annot`` to the group for ``chrom``, creating it if needed.

    ``index_by_chr`` remembers where each chromosome's group sits in
    ``groups``, so the lookup stays correct even when some chromosomes have
    no entries (positional ``chr_index - 1`` lookup does not).
    """
    if chrom in index_by_chr:
        groups[index_by_chr[chrom]]["annots"].append(annot)
    else:
        index_by_chr[chrom] = len(groups)
        groups.append({"chr": chrom, "annots": [annot]})


def analyze_sample(sample_lines, rsids, show_snpedia_results=True,
                   snpedia=None):
    """Cross-reference AncestryDNA sample rows with ClinVar (and SNPedia).

    Args:
        sample_lines: Lines of the AncestryDNA raw-data file.
        rsids: Lookup returned by ``parse_clinvar``.
        show_snpedia_results: Whether to collect SNPedia comments.
        snpedia: SNPedia archive; defaults to module-level ``snpedia_json``.

    Returns:
        Dict with keys ``num_rsids`` (data rows read), ``bed`` (BED lines),
        ``annots`` and ``clin_annots`` (lists of ``{"chr", "annots"}``
        groups) and ``rs_summaries`` (OrderedDict of summary strings).
    """
    rs_summaries = OrderedDict([
        ("pathogenic", []),
        ("likely_pathogenic", []),
        ("drug_response", []),
    ])
    if show_snpedia_results:
        rs_summaries["snpedia"] = []

    bed = []
    annots = []
    clin_annots = []
    annots_index = {}
    clin_annots_index = {}
    num_rsids = 0

    for line in sample_lines:
        # Skip blank lines, comments and the column-header row
        if not line.strip() or line[0] == "#" or line[:4] == "rsid":
            continue

        num_rsids += 1
        columns = line.strip().split("\t")
        bed.append(convert_to_bed(columns))

        name = columns[0]  # rsid
        chrom = str(int(columns[1]))  # chromosome
        start = int(columns[2])  # position
        length = 1  # they're all single nucleotide variants
        allele1 = columns[3]
        allele2 = columns[4]

        if chrom == "23":
            chrom = "X"
        elif chrom == "24":
            chrom = "Y"
        elif chrom in ("25", "26"):
            continue  # TODO: mitochondrial DNA

        homozygous = 1 if allele1 == allele2 else 0
        zygo = "homozygous" if homozygous == 1 else "heterozygous"
        genotype = name + "(" + allele1 + ";" + allele2 + ")"

        if name not in rsids:
            continue
        variant = rsids[name]

        if show_snpedia_results:
            snpedia_comment = get_snpedia_comment(
                name, allele1, allele2, snpedia
            )
            if len(snpedia_comment) > 0:
                # SNPedia seems noisier than ClinVar, also much overlap.
                rs_summaries["snpedia"].append(
                    "SNPedia result for " + genotype + ":\n" +
                    "\t" + snpedia_comment
                )

        clinsig = -1  # Sample does not carry a ClinVar clinical allele
        for i, clinallele in enumerate(variant["clinalleles"]):
            if clinallele not in (allele1, allele2):
                clinsig = -1
                continue

            # TODO: Hom vs. het clinsig
            # NOTE(review): "clinsigs" is flattened over allele x disease
            # but indexed here by allele position; only the first entry is
            # used in practice.  Preserved as-is.
            clinsig, disease, clinrevstat, clinacc = variant["clinsigs"][i]

            if clinsig > 3 and clinsig != 255:
                cs_label = clinsig_labels[clinsig]
                rs_summary = (
                    "\n" + cs_label + ", " + zygo + ": " + genotype + "\n" +
                    "\tDisease: " + disease + "\n" +
                    "\tReview status: " + clinrevstat + "\n" +
                    "\tClinVar record: " + clinvar_url + clinacc
                )
                key = cs_label.lower().replace(" ", "_")
                # setdefault: "histocompatibility" has no predefined bucket
                rs_summaries.setdefault(key, []).append(rs_summary)

            if clinsig in clinsig_track_index:
                clin_annot = [
                    name, start, length, clinsig_track_index[clinsig]
                ]
                _append_annot(
                    clin_annots, clin_annots_index, chrom, clin_annot
                )

        annot = [
            name,
            start,
            length,
            homozygous,
            allele_map[allele1],
            allele_map[allele2],
            clinsig,
        ]
        _append_annot(annots, annots_index, chrom, annot)

    return {
        "num_rsids": num_rsids,
        "bed": bed,
        "annots": annots,
        "clin_annots": clin_annots,
        "rs_summaries": rs_summaries,
    }


def _ensure_clinvar_vcf():
    """Download and decompress the ClinVar VCF if not cached; return its path."""
    leaf = "clinvar_" + clinvar_date + ".vcf"
    clinvar_vcf_path = data_dir + leaf
    if not os.path.exists(clinvar_vcf_path):
        url = (
            "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/archive_1.0/" +
            clinvar_date[:4] + "/" + leaf + ".gz"
        )
        # Write to a temporary name first so an interrupted download is not
        # mistaken for a complete cache on the next run.
        partial_path = clinvar_vcf_path + ".part"
        with request.urlopen(url) as response:
            with gzip.GzipFile(fileobj=response) as gzip_file, \
                    open(partial_path, "w") as f:
                for line in gzip_file:
                    f.write(line.decode("utf-8"))
        os.replace(partial_path, clinvar_vcf_path)
    return clinvar_vcf_path


def _ensure_snpedia_json():
    """Download the SNPedia archive if not cached; return its path."""
    snpedia_json_path = data_dir + "snpedia-archive.json"
    if not os.path.exists(snpedia_json_path):
        url = (
            "https://raw.githubusercontent.com/heiner/snpedia-23andme/"
            "master/snpedia-archive.json"
        )
        with request.urlopen(url) as response:
            data = response.read()
        partial_path = snpedia_json_path + ".part"
        with open(partial_path, "w") as f:
            f.write(data.decode("utf-8"))
        os.replace(partial_path, snpedia_json_path)
    return snpedia_json_path


def _write_outputs(results, num_clinvar_rsids, num_skipped_clinvars,
                   output_file):
    """Write the JSON, BED and plaintext outputs into ``data_dir``."""
    annots_doc = {
        "keys": [
            "name", "start", "length", "homozygous", "allele1", "allele2",
            "clinsig"
        ],
        "annots": results["annots"],
    }
    with open(data_dir + "ancestrydna.json", "w") as f:
        f.write(json.dumps(annots_doc))

    tracks_doc = {
        "keys": ["name", "start", "length", "trackIndex"],
        "annots": results["clin_annots"],
    }
    with open(data_dir + "ancestrydna-tracks.json", "w") as f:
        f.write(json.dumps(tracks_doc))

    with open(data_dir + "ancestrydna.bed", "w") as f:
        f.write("\n".join(results["bed"]))

    s = results["rs_summaries"]
    output = [
        "Number variants in AncestryDNA sample:",
        str(results["num_rsids"]) + "\n",
        "Number of variants in ClinVar analyzed:",
        str(num_clinvar_rsids) + "\n",
        "Number of skipped clinical variants:",
        str(num_skipped_clinvars) + "\n",
        "\nClinically significant variants in AncestryDNA sample:\n" +
        "\tPathogenic: " + str(len(s["pathogenic"])) + "\n" +
        "\tLikely pathogenic: " + str(len(s["likely_pathogenic"])) + "\n" +
        "\tDrug response: " + str(len(s["drug_response"])) + "\n",
    ]
    for key in s:
        output.extend(s[key])

    with open(output_file, "w") as f:
        f.write("\n".join(output))


def _parse_args():
    """Build the command-line parser and parse ``sys.argv``."""
    parser = argparse.ArgumentParser(
        description=(
            "Analyze AncestryDNA raw data.  Outputs plaintext genome analysis and " +
            "interactive genome-wide visualization of AncestryDNA genomic data\n\n" +
            "Example:\n" +
            "python3 analyze_ancestrydna.py --input ~/AncestryDNA.txt"
        ),
        formatter_class=argparse.RawTextHelpFormatter
    )
    parser.add_argument(
        "--input", "-i",
        help="Input AncestryDNA.txt file",
        required=True
    )
    parser.add_argument(
        "--snpedia", "-s",
        help="Show SNPpedia result.  Default: true",
        type=str_to_bool,
        default=True
    )
    return parser.parse_args()


def main():
    """Run the full analysis from the command line."""
    global snpedia_json

    args = _parse_args()
    show_snpedia_results = args.snpedia

    os.makedirs(data_dir, exist_ok=True)

    output_file = data_dir + "genome_analysis.txt"

    # Raw sample data from AncestryDNA (read first so a bad path fails
    # before any download starts)
    with open(args.input) as f:
        ancestrydna_sample = f.readlines()

    # Download ClinVar data if not already available
    with open(_ensure_clinvar_vcf()) as f:
        clinvar_vcf_file = f.readlines()

    # Download SNPedia data if not already available
    with open(_ensure_snpedia_json()) as f:
        snpedia_json = json.loads(f.read())

    rsids, num_skipped_clinvars = parse_clinvar(clinvar_vcf_file)
    results = analyze_sample(
        ancestrydna_sample, rsids, show_snpedia_results, snpedia_json
    )

    _write_outputs(results, len(rsids), num_skipped_clinvars, output_file)

    shutil.copy("../../examples/vanilla/ancestry.html", data_dir)
    shutil.copy("../../examples/vanilla/ancestry-tracks.html", data_dir)

    print(
        "\nAnalysis of AncestryDNA data in:\n" +
        "\t../data/analysis/genome_analysis.txt\n" +
        "\thttp://localhost/ideogram/data/analysis/ancestry.html\n" +
        "\thttp://localhost/ideogram/data/analysis/ancestry-tracks.html\n"
    )


if __name__ == "__main__":
    main()