Source code for pyrice.build_dictionary

# -*- coding: utf-8 -*-

from argparse import ArgumentParser
from pyrice.multi_query import MultiQuery
from pyrice import multi_query
import time
import pickle
import gzip
import shutil
import os
import wget

# parser = ArgumentParser("build_dictionary", conflict_handler='resolve')
# parser.add_argument("--rapdb_url", type=str, default="https://rapdb.dna.affrc.go.jp/download/archive/irgsp1/IRGSP-1.0_representative_annotation_2019-12-17.tsv.gz")
# parser.add_argument("--oryzabase_url",type=str, default="https://shigen.nig.ac.jp/rice/oryzabase/gene/download?classtag=GENE_EN_LIST")
# # args = parser.parse_args()

# Directory containing the pyrice.multi_query module file. The functions in this
# module join it with "support/..." relative paths to locate the data files they
# read and write.
dir_path = os.path.dirname(multi_query.__file__)

def update_gene_dictionary():
    """
    Update function for gene dictionary

    Runs ``MultiQuery.search_on_chromosome`` against the "iric" database for each
    of the twelve chromosome ranges listed below, merges the results and pickles
    three dictionaries into the ``support`` directory under ``dir_path``:

    * ``iric_dict.pkl`` -- the merged search results
    * ``loc_dict.pkl`` -- every ``msu7Name`` entry mapped to the first key of
      ``iric_dict`` that lists it
    * ``id_dict.pkl`` -- every ``raprepName`` entry mapped to the first key of
      ``iric_dict`` that lists it
    """
    query = MultiQuery()
    # (chromosome, end position) pairs; every search starts at position "1".
    chromosome_ends = (
        ("chr01", "43270923"),
        ("chr02", "35937250"),
        ("chr03", "36413819"),
        ("chr04", "35502694"),
        ("chr05", "29958434"),
        ("chr06", "31248787"),
        ("chr07", "29697621"),
        ("chr08", "28443022"),
        ("chr09", "23012720"),
        ("chr10", "23207287"),
        ("chr11", "29021106"),
        ("chr12", "27531856"),
    )

    iric_dict = dict()
    for chromosome, end_pos in chromosome_ends:
        started = time.time()
        found = query.search_on_chromosome(chro=chromosome, start_pos="1", end_pos=end_pos,
                                           number_process=4, dbs=["iric"])
        iric_dict.update(found)
        print("Time for search gene on chromosome {} : {}".format(chromosome, time.time() - started))

    # Reverse indexes: setdefault keeps the first IRIC name seen for each identifier.
    id_dict = dict()
    loc_dict = dict()
    for iric_name, record in iric_dict.items():
        if record["msu7Name"] is not None:
            for loc in record["msu7Name"]:
                loc_dict.setdefault(loc, iric_name)
        if record["raprepName"] is not None:
            for rap_id in record["raprepName"]:
                id_dict.setdefault(rap_id, iric_name)

    # The with-statement closes each file; no explicit close() is needed.
    outputs = (
        ('support/iric_dict.pkl', iric_dict),
        ('support/loc_dict.pkl', loc_dict),
        ('support/id_dict.pkl', id_dict),
    )
    for relative_path, payload in outputs:
        with open(os.path.join(dir_path, relative_path), 'wb') as f:
            pickle.dump(payload, f)
# Download targets, relative to dir_path (update_local_database joins them with it).
# Rapdb: the gzip archive as downloaded, and the file it is decompressed into.
source_filepath = "support/rapdb.gz"
dest_filepath = "support/rapdb.tsv"
# Oryzabase: index 0 is read as the gene list, index 1 as the reference list.
oryzabase_filepaths = [
    'support/oryzabase.txt',
    'support/oryzabase_ref.txt',
]
def gunzip_shutil(source_filepath, dest_filepath, block_size=65536):
    """
    Decompress a gzip file into a new file

    :param source_filepath: (str) path of the gzip-compressed input file
    :param dest_filepath: (str) path the decompressed bytes are written to
    :param block_size: (int) buffer length handed to ``shutil.copyfileobj``
    """
    # The source is opened first, so an unreadable source path raises before the
    # destination file is created or truncated.
    with gzip.open(source_filepath, 'rb') as compressed:
        with open(dest_filepath, 'wb') as plain:
            shutil.copyfileobj(compressed, plain, block_size)
def _is_usable_name(field):
    """Return True when a name column holds more than an empty or placeholder value."""
    return len(field) > 0 and field not in ('\r\n', '-', '_')


def _is_single_char_run(field):
    """Return True when *field*, commas removed, is empty or one repeated character."""
    stripped = field.replace(',', '')
    # Targets placeholder lists such as '_,_,_'. A field made only of commas is
    # treated the same way; indexing stripped[0] on it used to raise IndexError.
    # NOTE(review): this also rejects genuine one-character names -- confirm intended.
    return not stripped or stripped == len(stripped) * stripped[0]


def _split_names(raw_names):
    """Split a comma-separated name field into whitespace-normalised real names."""
    names = []
    for part in " ".join(raw_names.split()).split(','):
        part = " ".join(part.split())
        if len(part) > 0 and part != '_' and part != '-':
            names.append(part)
    return names


def _read_tsv_rows(path):
    """Read a tab-separated text file into rows, dropping the header's last character."""
    with open(path, "r") as f:
        rows = [line.split("\t") for line in f.readlines()]
    # Only the header loses its final character (the line ending); data cells are
    # stored exactly as read.
    rows[0][-1] = rows[0][-1][:-1]
    return rows


def _copy_columns(target, header, row):
    """Store every cell of *row* longer than one character under its header name."""
    # zip() stops at the shorter sequence, so a row with fewer cells than the
    # header is copied as far as it goes instead of raising IndexError.
    for column, value in zip(header, row):
        if value != '\n' and len(value) > 1:
            target.setdefault(column, value)


def _remove_if_exists(path):
    """Delete *path* when it exists."""
    if os.path.exists(path):
        os.remove(path)


def _load_pubmed_index(refs_path):
    """Map each gene name in the Oryzabase reference file to its list of articles."""
    with open(refs_path, 'rb') as f:
        raw_lines = f.readlines()

    rows = []
    for raw in raw_lines:
        try:
            rows.append(raw.decode('utf-8').split("\t"))
        except UnicodeDecodeError:
            # Best effort: lines that are not valid UTF-8 are skipped.
            continue

    id_2_pubmed = dict()
    for d in rows[1:]:
        # Columns up to index 9 are read below; shorter rows cannot be used.
        if len(d) < 10:
            continue
        # Check length pubmed id
        if len(d[1]) <= 3:
            continue
        if _is_usable_name(d[8]) and not _is_single_char_run(d[8]):
            use_name = d[8]
        # Check if Gene Name Synonym
        elif _is_usable_name(d[9]):
            use_name = d[9]
        else:
            continue
        for n in _split_names(use_name):
            article = {"PubMedId": d[1], "Author": d[2], "Title": d[3],
                       "Journal": d[4], "Year": d[7]}
            id_2_pubmed.setdefault(n, []).append(article)
    return id_2_pubmed


def _build_oryzabase(genes_path, id_dict, id_2_pubmed):
    """Build the ``{iric_name: {"oryzabase": {...}}}`` dictionary from the gene list file."""
    rows = _read_tsv_rows(genes_path)
    header = rows[0]
    oryzabase = dict()
    for d in rows:
        # Column 10 is looked up in id_dict; rows without a known id are skipped.
        if len(d) <= 10 or d[10] not in id_dict:
            continue
        iric_name = id_dict[d[10]]
        entry = oryzabase.setdefault(iric_name, {"oryzabase": dict()})["oryzabase"]
        _copy_columns(entry, header, d)

        # Match CGSNL Gene Symbol with id
        if _is_usable_name(d[1]):
            use_name = d[1]
        # Match Gene symbol synonym(s) with id
        elif _is_usable_name(d[2]):
            use_name = d[2]
        else:
            continue

        pubmed_exist = set()
        pubmed_final = []
        for n in _split_names(use_name):
            for pubmed_art in id_2_pubmed.get(n, ()):
                if pubmed_art['PubMedId'] not in pubmed_exist:
                    # Record the id so the same article is listed only once.
                    pubmed_exist.add(pubmed_art['PubMedId'])
                    pubmed_final.append(pubmed_art)
        entry.setdefault("Reference", pubmed_final)
    return oryzabase


def _build_rapdb(tsv_path, id_dict):
    """Build the ``{iric_name: {"rapdb": {...}}}`` dictionary from the Rapdb TSV file."""
    rows = _read_tsv_rows(tsv_path)
    header = rows[0]
    rapdb = dict()
    for d in rows:
        # Column 1 is looked up in id_dict; rows without a known id are skipped.
        if len(d) > 16 and d[1] in id_dict:
            entry = rapdb.setdefault(id_dict[d[1]], {"rapdb": dict()})["rapdb"]
            _copy_columns(entry, header, d)
    return rapdb


def update_local_database(rapdb_url, oryzabase_url):
    """
    Update function for rapdb database and oryzabase database

    Downloads the Oryzabase gene and reference lists and the Rapdb annotation
    archive into the ``support`` directory under ``dir_path``, keeps the rows whose
    id appears in ``support/id_dict.pkl``, and writes ``support/oryzabase.pkl`` and
    ``support/rapdb.pkl``. The downloaded files are removed once each pickle has
    been written.

    :param rapdb_url: (str) url for download rapdb database
    :param oryzabase_url: (list) url for download oryzabase database (1st: url of genes, 2nd: url of refs)
    """
    with open(os.path.join(dir_path, "support/id_dict.pkl"), "rb") as f:
        id_dict = pickle.load(f)

    ## Oryzabase
    oryzabase_paths = [os.path.join(dir_path, relative) for relative in oryzabase_filepaths]
    print('Beginning Oryzabase database download with requests')
    for url, target in zip(oryzabase_url, oryzabase_paths):
        # Clear leftovers from an earlier run first.
        # NOTE(review): wget.download appears to pick a new " (1)"-style name rather
        # than overwrite an existing target, which would leave the stale file to be
        # parsed below -- confirm against the wget package.
        _remove_if_exists(target)
        wget.download(url, target)
    print('Download successfully Oryzabase database')

    id_2_pubmed = _load_pubmed_index(oryzabase_paths[1])
    oryzabase = _build_oryzabase(oryzabase_paths[0], id_dict, id_2_pubmed)
    with open(os.path.join(dir_path, "support/oryzabase.pkl"), "wb") as f:
        pickle.dump(oryzabase, f)
    # Remove the files where they were written (under dir_path), not relative to
    # the current working directory.
    for target in oryzabase_paths:
        _remove_if_exists(target)
    print('Build successfully Oryzabase database')

    # Rapdb
    rapdb_archive = os.path.join(dir_path, source_filepath)
    rapdb_tsv = os.path.join(dir_path, dest_filepath)
    print('Beginning Rapdb database download with requests')
    _remove_if_exists(rapdb_archive)
    wget.download(rapdb_url, rapdb_archive)
    gunzip_shutil(rapdb_archive, rapdb_tsv)
    print('Download successfully Rapdb database')

    rapdb = _build_rapdb(rapdb_tsv, id_dict)
    with open(os.path.join(dir_path, "support/rapdb.pkl"), "wb") as f:
        pickle.dump(rapdb, f)
    _remove_if_exists(rapdb_archive)
    _remove_if_exists(rapdb_tsv)
    print('Build successfully Rapdb database')
if __name__ == '__main__':
    # Rebuilding the gene dictionary is left disabled, as in the original script.
    # update_gene_dictionary()
    rapdb_archive_url = "https://rapdb.dna.affrc.go.jp/download/archive/irgsp1/IRGSP-1.0_representative_annotation_2021-05-10.tsv.gz"
    # Order matters: gene list first, reference list second.
    oryzabase_urls = [
        "https://shigen.nig.ac.jp/rice/oryzabase/gene/download?classtag=GENE_EN_LIST",
        "https://shigen.nig.ac.jp/rice/oryzabase/reference/download",
    ]
    update_local_database(rapdb_archive_url, oryzabase_urls)