Skip to content

Commit 00e2eca

Browse files
committed
update pre-build HCP db
1 parent 4b980b6 commit 00e2eca

File tree

2 files changed

+49
-25
lines changed

2 files changed

+49
-25
lines changed

physpetool/database/support_hcp_organism.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5659,4 +5659,4 @@ metz 1898379
56595659
ric 1182263
56605660
coh 2480923
56615661
sutt 2259134
5662-
amah 332411
5662+
amah 332411

physpetool/phylotree/retrieveprotein.py

Lines changed: 48 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
2727
2828
"""
29+
import glob
2930
import ftplib
3031
import os
3132
import sqlite3
@@ -37,6 +38,7 @@
3738
logretrieveprotein = getLogging('KEGG INDEX DB')
3839
KEGGDB = "KEGG_DB_3.0.db"
3940

41+
4042
def getspecies(name, colname):
4143
"""
4244
Get species protein index for DB
@@ -61,7 +63,7 @@ def getspecies(name, colname):
6163

6264
num_none = len([x for x in idslist if x == 'None'])
6365

64-
if num_none != len(idslist) :
66+
if num_none != len(idslist):
6567
relist.append(idslist)
6668
match_ko_name.append(ko)
6769
else:
@@ -88,7 +90,8 @@ def splist(l, s):
8890
return [l[i:i + s] for i in range(len(l)) if i % s == 0]
8991

9092

91-
def retrieveprotein(proindexlist, outpath, matchlist,spelist):
93+
94+
def retrieveprotein(proindexlist, outpath, matchlist, spelist):
9295
"""
9396
Retrieve proteins form Kegg DB
9497
:param proindexlist: a list contain protein index
@@ -103,14 +106,33 @@ def retrieveprotein(proindexlist, outpath, matchlist,spelist):
103106
if not os.path.exists(dirname):
104107
os.makedirs(dirname)
105108

106-
107109
connect = ftplib.FTP("173.255.208.244")
108110
connect.login('anonymous')
109111
connect.cwd('/pub/databasehcp')
110112
p = 1
111-
112113
# deal with none
113-
114+
# connect.mkd("tem/")
115+
fasta = {}
116+
117+
for line in spelist:
118+
w_file = dirname + "/" + line + ".fa"
119+
fw_ = open(w_file, 'ab')
120+
retrievename = line + '.fasta'
121+
remoteFileName = 'RETR ' + os.path.basename(retrievename)
122+
connect.retrbinary(remoteFileName, fw_.write)
123+
fw_.write(b'\n')
124+
fw_.close()
125+
logretrieveprotein.info("Retrieve " + line + " highly conserved proteins completely")
126+
# read get sequences
127+
with open(w_file, 'r') as f:
128+
for line in f:
129+
if line != "\n":
130+
tem = line.strip()
131+
if tem[0] == '>':
132+
header = tem[1:]
133+
else:
134+
sequence = tem
135+
fasta[header] = fasta.get(header, '') + sequence
114136

115137
for index in proindexlist:
116138
have_none = False
@@ -124,30 +146,32 @@ def retrieveprotein(proindexlist, outpath, matchlist,spelist):
124146
else:
125147
q_index = index
126148

127-
128149
hcp_pro_name = hcp_name(matchlist[p - 1])
129150

130151
wfiles = "{0}/p{1}.fasta".format(dirname, p)
131-
fw = open(wfiles, 'ab')
152+
fw = open(wfiles, 'a')
132153

133154
for id in q_index:
134-
not_get_abb = []
135-
retrievename = id + '.fasta'
136-
remoteFileName = 'RETR ' + os.path.basename(retrievename)
137-
connect.retrbinary(remoteFileName, fw.write)
138-
fw.write(b'\n')
139-
140-
if have_none:
141-
for line in app_spe:
142-
name_none = ">" + line +"\n"
143-
fw.write(name_none.encode())
144-
fw.write(b"M"+b"\n")
155+
abb_name = id.strip().split(":")[0]
156+
if id in fasta.keys():
157+
fw.write(">"+abb_name+"\n"+fasta[id]+"\n")
158+
else:
159+
name_none = ">" + abb_name + "\n"
160+
fw.write(name_none + "M" + "\n")
161+
if have_none:
162+
for line in app_spe:
163+
name_none = ">" + line + "\n"
164+
fw.write(name_none + "M" + "\n")
145165
fw.close()
146166

147167
logretrieveprotein.info(
148-
"Retrieve and download of highly conserved protein '{0}' was successful store in p{1}.fasta file".format(hcp_pro_name, str(p)))
168+
"Retrieve and download of highly conserved protein '{0}' was successful store in p{1}.fasta file".format(
169+
hcp_pro_name, str(p)))
149170
p += 1
150171
logretrieveprotein.info("Retrieve from KEGG database " + str(p - 1) + " highly conserved proteins")
172+
connect.quit()
173+
for infile in glob.glob(os.path.join(dirname, '*.fa')):
174+
os.remove(infile)
151175
return dirname
152176

153177

@@ -161,7 +185,7 @@ def doretrieve(specieslistfile, outpath):
161185
logretrieveprotein.info("Reading organisms's names success!")
162186
colname = getcolname()
163187
proindexlist, matchlist = getspecies(spelist, colname)
164-
dirpath = retrieveprotein(proindexlist, outpath, matchlist,spelist)
188+
dirpath = retrieveprotein(proindexlist, outpath, matchlist, spelist)
165189
return dirpath
166190

167191

@@ -177,12 +201,12 @@ def hcp_name(index):
177201

178202

179203
if __name__ == '__main__':
180-
print (getcolname())
181-
print (getspecies(['ath'], ['K01409']))
204+
print(getcolname())
205+
print(getspecies(['ath'], ['K01409']))
182206
# for line in getcolname():
183207
# if getspecies(['mdm'], [line])[0] != []:
184208
# proid = getspecies(['mdm'], [line])[0][0][0]
185209
# print("http://rest.kegg.jp/get/" + proid + "/aaseq")
186-
specieslistfile =['zma',"ath"]
210+
specieslistfile = ['zma', "ath", "eco"]
187211
outpath = "/home/yangfang/test/alg2/"
188-
doretrieve(specieslistfile, outpath)
212+
doretrieve(specieslistfile, outpath)

0 commit comments

Comments
 (0)