# Source code for pyrice.build_dictionary

# -*- coding: utf-8 -*-

from argparse import ArgumentParser
from pyrice.multi_query import MultiQuery
from pyrice import multi_query
import time
import pickle
import gzip
import shutil
import os
import wget

# parser = ArgumentParser("build_dictionary", conflict_handler='resolve')
# parser.add_argument("--rapdb_url", type=str, default="https://rapdb.dna.affrc.go.jp/download/archive/irgsp1/IRGSP-1.0_representative_annotation_2019-12-17.tsv.gz")
# parser.add_argument("--oryzabase_url",type=str, default="https://shigen.nig.ac.jp/rice/oryzabase/gene/download?classtag=GENE_EN_LIST")
# # args = parser.parse_args()

# Directory containing the pyrice.multi_query module file; the "support/..."
# data files read and written by the functions below are joined onto it.
dir_path = os.path.dirname(multi_query.__file__)

def update_gene_dictionary():
    """
    Update function for gene dictionary

    Searches the "iric" database on each of chr01..chr12 through
    ``MultiQuery.search_on_chromosome`` and merges the per-chromosome results
    into one dictionary keyed by iric name. Two reverse lookups are derived
    from it:

    * ``loc_dict``: every entry of a gene's ``"msu7Name"`` -> iric name
    * ``id_dict``: every entry of a gene's ``"raprepName"`` -> iric name

    When the same identifier appears under several genes, the first gene
    encountered wins. The three dictionaries are pickled to
    ``support/iric_dict.pkl``, ``support/loc_dict.pkl`` and
    ``support/id_dict.pkl`` under ``dir_path``.

    """
    query = MultiQuery()
    # (chromosome, end position) pairs; every search starts at position "1".
    # NOTE(review): the end positions look like chromosome lengths - confirm.
    chromosome_ranges = [
        ("chr01", "43270923"), ("chr02", "35937250"), ("chr03", "36413819"),
        ("chr04", "35502694"), ("chr05", "29958434"), ("chr06", "31248787"),
        ("chr07", "29697621"), ("chr08", "28443022"), ("chr09", "23012720"),
        ("chr10", "23207287"), ("chr11", "29021106"), ("chr12", "27531856"),
    ]
    iric_dict = dict()
    for chromosome, end_pos in chromosome_ranges:
        t = time.time()
        found = query.search_on_chromosome(chro=chromosome, start_pos="1",
                                           end_pos=end_pos, number_process=4,
                                           dbs=["iric"])
        iric_dict.update(found)
        print("Time for search gene on chromosome {} : {}".format(chromosome, time.time() - t))

    id_dict = dict()
    loc_dict = dict()
    for iric_name, gene in iric_dict.items():
        # setdefault keeps the first iric name seen for a given identifier.
        if gene["msu7Name"] is not None:
            for loc in gene["msu7Name"]:
                loc_dict.setdefault(loc, iric_name)
        if gene["raprepName"] is not None:
            for rap_id in gene["raprepName"]:
                id_dict.setdefault(rap_id, iric_name)

    outputs = (
        ('support/iric_dict.pkl', iric_dict),
        ('support/loc_dict.pkl', loc_dict),
        ('support/id_dict.pkl', id_dict),
    )
    for relative_path, payload in outputs:
        # The with-block closes the file; no explicit close() is needed.
        with open(os.path.join(dir_path, relative_path), 'wb') as f:
            pickle.dump(payload, f)


# Locations of the downloaded database files, relative to dir_path
# (update_local_database joins them onto dir_path before use).
# Rapdb: the gzip archive as downloaded, and its decompressed TSV.
source_filepath = 'support/rapdb.gz'
dest_filepath = 'support/rapdb.tsv'
# Oryzabase: index 0 is the genes file, index 1 is the references file.
oryzabase_filepaths = [
    'support/oryzabase.txt',
    'support/oryzabase_ref.txt',
]

def gunzip_shutil(source_filepath, dest_filepath, block_size=65536):
    """
    Function to decompress a gzip file

    :param source_filepath: (str) path of the gzip-compressed source file (.gz)
    :param dest_filepath: (str) path the decompressed bytes are written to;
        opened in 'wb' mode, so an existing file is overwritten
    :param block_size: (int) buffer length passed to shutil.copyfileobj

    """
    # Stream block by block so the whole archive is never held in memory.
    with gzip.open(source_filepath, 'rb') as s_file, \
            open(dest_filepath, 'wb') as d_file:
        shutil.copyfileobj(s_file, d_file, block_size)

def _remove_if_exists(path):
    """Delete *path* when it exists; do nothing otherwise."""
    if os.path.exists(path):
        os.remove(path)


def _download_fresh(url, target):
    """Download *url* to *target*, discarding any stale copy first."""
    # wget.download does not overwrite an existing file: it saves under a
    # renamed path instead, which would leave the old data at *target*.
    _remove_if_exists(target)
    wget.download(url, target)


def _is_usable_name(value):
    """Return True unless *value* is empty or a bare placeholder marker."""
    return value not in ('', '\r\n', '-', '_')


def _is_repeated_placeholder(value):
    """Return True when *value* without commas is empty or one repeated character (e.g. '_,_,_')."""
    stripped = value.replace(',', '')
    return not stripped or stripped == len(stripped) * stripped[0]


def _split_names(raw_names):
    """Split a comma-separated field into whitespace-normalized names, dropping '', '_' and '-'."""
    names = []
    for part in " ".join(raw_names.split()).split(','):
        part = " ".join(part.split())
        if part and part != '_' and part != '-':
            names.append(part)
    return names


def _read_tsv_rows(path):
    """Read *path* as text and split each line on tabs; the header's trailing character is removed."""
    with open(path, "r") as f:
        rows = [line.split("\t") for line in f]
    if rows:
        # Drop the last character (the line break) of the final header column.
        rows[0][-1] = rows[0][-1][:-1]
    return rows


def _store_fields(record, header, row):
    """Copy the columns of *row* into *record* under their header names, keeping existing entries."""
    # zip stops at the shorter sequence, so a row shorter than the header
    # no longer raises IndexError.
    for column, value in zip(header, row):
        # Values of length 0 or 1 (including a lone '\n') are skipped.
        if len(value) > 1:
            record.setdefault(column, value)


def _load_pubmed_by_gene_name(path):
    """Build a mapping gene name -> list of article dicts from the Oryzabase references file."""
    with open(path, 'rb') as f:
        raw_lines = f.readlines()
    rows = []
    for raw in raw_lines:
        try:
            rows.append(raw.decode('utf-8').split("\t"))
        except UnicodeDecodeError:
            # Lines that are not valid UTF-8 are skipped.
            continue

    pubmed_by_name = dict()
    # The first decoded line is skipped (treated as the header).
    for row in rows[1:]:
        # Columns 1..8 are read below; rows too short to hold them are
        # skipped. Column 1 is used as the PubMed id and must be longer
        # than 3 characters.
        if len(row) < 9 or len(row[1]) <= 3:
            continue
        primary = row[8]
        # NOTE(review): column 9 is treated as the gene name synonym - confirm.
        synonym = row[9] if len(row) > 9 else ''
        if _is_usable_name(primary) and not _is_repeated_placeholder(primary):
            raw_names = primary
        elif _is_usable_name(synonym):
            raw_names = synonym
        else:
            continue
        for name in _split_names(raw_names):
            article = {"PubMedId": row[1],
                       "Author": row[2],
                       "Title": row[3],
                       "Journal": row[4],
                       "Year": row[7]}
            pubmed_by_name.setdefault(name, []).append(article)
    return pubmed_by_name


def _build_oryzabase(path, id_dict, pubmed_by_name):
    """Build {iric name: {"oryzabase": fields}} from the Oryzabase genes file."""
    rows = _read_tsv_rows(path)
    header = rows[0] if rows else []
    oryzabase = dict()
    for row in rows:
        # Column 10 is the identifier looked up in id_dict.
        if len(row) <= 10 or row[10] not in id_dict:
            continue
        iric_name = id_dict[row[10]]
        record = oryzabase.setdefault(iric_name, {"oryzabase": dict()})["oryzabase"]
        _store_fields(record, header, row)

        # Prefer column 1 (CGSNL gene symbol per the original comments),
        # falling back to column 2 (gene symbol synonyms).
        if _is_usable_name(row[1]):
            raw_names = row[1]
        elif _is_usable_name(row[2]):
            raw_names = row[2]
        else:
            continue

        seen_pubmed_ids = set()
        references = []
        for name in _split_names(raw_names):
            for article in pubmed_by_name.get(name, ()):
                pubmed_id = article['PubMedId']
                if pubmed_id not in seen_pubmed_ids:
                    # Record the id so the same article is listed only once.
                    seen_pubmed_ids.add(pubmed_id)
                    references.append(article)
        record.setdefault("Reference", references)
    return oryzabase


def _build_rapdb(path, id_dict):
    """Build {iric name: {"rapdb": fields}} from the decompressed Rapdb TSV file."""
    rows = _read_tsv_rows(path)
    header = rows[0] if rows else []
    rapdb = dict()
    for row in rows:
        # Column 1 is the identifier looked up in id_dict.
        if len(row) > 16 and row[1] in id_dict:
            record = rapdb.setdefault(id_dict[row[1]], {"rapdb": dict()})["rapdb"]
            _store_fields(record, header, row)
    return rapdb


def update_local_database(rapdb_url, oryzabase_url):
    """
    Update function for rapdb database and oryzabase database

    Downloads the Oryzabase genes/references files and the Rapdb archive into
    ``dir_path``, keeps only the rows whose identifier is present in
    ``support/id_dict.pkl``, and pickles the results to
    ``support/oryzabase.pkl`` and ``support/rapdb.pkl``. The downloaded files
    are removed afterwards.

    :param rapdb_url: (str) url for download rapdb database
    :param oryzabase_url: (list) url for download oryzabase database (1st: url of genes, 2nd: url of refs)

    """
    with open(os.path.join(dir_path, "support/id_dict.pkl"), "rb") as f:
        id_dict = pickle.load(f)

    ## Oryzabase
    oryzabase_paths = [os.path.join(dir_path, relative_path)
                       for relative_path in oryzabase_filepaths]
    print('Beginning Oryzabase database download with requests')
    for url, path in zip(oryzabase_url, oryzabase_paths):
        _download_fresh(url, path)
    print('Download successfully Oryzabase database')

    pubmed_by_name = _load_pubmed_by_gene_name(oryzabase_paths[1])
    oryzabase = _build_oryzabase(oryzabase_paths[0], id_dict, pubmed_by_name)
    with open(os.path.join(dir_path, "support/oryzabase.pkl"), "wb") as f:
        pickle.dump(oryzabase, f)
    # Remove the files that were actually downloaded (paths under dir_path).
    for path in oryzabase_paths:
        _remove_if_exists(path)
    print('Build successfully Oryzabase database')

    # Rapdb
    rapdb_archive = os.path.join(dir_path, source_filepath)
    rapdb_tsv = os.path.join(dir_path, dest_filepath)
    print('Beginning Rapdb database download with requests')
    _download_fresh(rapdb_url, rapdb_archive)
    gunzip_shutil(rapdb_archive, rapdb_tsv)
    print('Download successfully Rapdb database')

    rapdb = _build_rapdb(rapdb_tsv, id_dict)
    with open(os.path.join(dir_path, "support/rapdb.pkl"), "wb") as f:
        pickle.dump(rapdb, f)
    _remove_if_exists(rapdb_archive)
    _remove_if_exists(rapdb_tsv)
    print('Build successfully Rapdb database')

if __name__ == '__main__':
    # The gene dictionary rebuild is left disabled in this entry point:
    # update_gene_dictionary()
    rapdb_archive_url = "https://rapdb.dna.affrc.go.jp/download/archive/irgsp1/IRGSP-1.0_representative_annotation_2021-05-10.tsv.gz"
    # Order matters: genes list first, references second.
    oryzabase_urls = [
        "https://shigen.nig.ac.jp/rice/oryzabase/gene/download?classtag=GENE_EN_LIST",
        "https://shigen.nig.ac.jp/rice/oryzabase/reference/download",
    ]
    update_local_database(rapdb_url=rapdb_archive_url, oryzabase_url=oryzabase_urls)