Skip to content

Commit 692aec2

Browse files
committed
add local hcp db module
1 parent 36c7628 commit 692aec2

File tree

2 files changed

+91
-62
lines changed

2 files changed

+91
-62
lines changed

physpetool/phylotree/retrieveprotein.py

Lines changed: 50 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
2727
2828
"""
29+
import shutil
2930
import glob
3031
import ftplib
3132
import os
@@ -100,7 +101,7 @@ def splist(l, s):
100101

101102

102103

103-
def retrieveprotein(proindexlist, outpath, matchlist, spelist):
104+
def retrieveprotein(proindexlist, outpath, matchlist, spelist, local_db):
104105
"""
105106
Retrieve proteins form Kegg DB
106107
:param proindexlist: a list contain protein index
@@ -115,33 +116,53 @@ def retrieveprotein(proindexlist, outpath, matchlist, spelist):
115116
if not os.path.exists(dirname):
116117
os.makedirs(dirname)
117118

118-
connect = ftplib.FTP("173.255.208.244")
119-
connect.login('anonymous')
120-
connect.cwd('/pub/databasehcp')
121-
p = 1
122-
# deal with none
123-
# connect.mkd("tem/")
124119
fasta = {}
120+
p = 1
125121

126-
for line in spelist:
127-
w_file = dirname + "/" + line + ".fa"
128-
fw_ = open(w_file, 'ab')
129-
retrievename = line + '.fasta'
130-
remoteFileName = 'RETR ' + os.path.basename(retrievename)
131-
connect.retrbinary(remoteFileName, fw_.write)
132-
fw_.write(b'\n')
133-
fw_.close()
134-
logretrieveprotein.info("Retrieve " + line + " highly conserved proteins completed")
135-
# read get sequences
136-
with open(w_file, 'r') as f:
137-
for line in f:
138-
if line != "\n":
139-
tem = line.strip()
140-
if tem[0] == '>':
141-
header = tem[1:]
142-
else:
143-
sequence = tem
144-
fasta[header] = fasta.get(header, '') + sequence
122+
# get hcp proteins form ftp server
123+
if local_db == "":
124+
connect = ftplib.FTP("173.255.208.244")
125+
connect.login('anonymous')
126+
connect.cwd('/pub/databasehcp')
127+
128+
for line in spelist:
129+
w_file = dirname + "/" + line + ".fa"
130+
fw_ = open(w_file, 'ab')
131+
retrievename = line + '.fasta'
132+
remoteFileName = 'RETR ' + os.path.basename(retrievename)
133+
connect.retrbinary(remoteFileName, fw_.write)
134+
fw_.write(b'\n')
135+
fw_.close()
136+
logretrieveprotein.info("Retrieve " + line + " highly conserved proteins completed")
137+
# read get sequences
138+
with open(w_file, 'r') as f:
139+
for line in f:
140+
if line != "\n":
141+
tem = line.strip()
142+
if tem[0] == '>':
143+
header = tem[1:]
144+
else:
145+
sequence = tem
146+
fasta[header] = fasta.get(header, '') + sequence
147+
# get protein sequence from local
148+
else:
149+
for line in spelist:
150+
file_name = line +".fasta"
151+
file_name_new = line + ".fa"
152+
abb_data_path = os.path.join(local_db,file_name)
153+
abb_data_path_new = os.path.join(dirname,file_name_new)
154+
shutil.copyfile(abb_data_path,abb_data_path_new)
155+
logretrieveprotein.info("Retrieve " + line + " highly conserved proteins completed")
156+
# read get sequences
157+
with open(abb_data_path_new, 'r') as f:
158+
for line in f:
159+
if line != "\n":
160+
tem = line.strip()
161+
if tem[0] == '>':
162+
header = tem[1:]
163+
else:
164+
sequence = tem
165+
fasta[header] = fasta.get(header, '') + sequence
145166

146167
for index in proindexlist:
147168
have_none = False
@@ -184,7 +205,7 @@ def retrieveprotein(proindexlist, outpath, matchlist, spelist):
184205
return dirname
185206

186207

187-
def doretrieve(specieslistfile, outpath):
208+
def doretrieve(specieslistfile, outpath,local_db):
188209
'''main to retrieve protein from kegg db'''
189210
# spelist = []
190211
# for line in specieslistfile:
@@ -194,7 +215,7 @@ def doretrieve(specieslistfile, outpath):
194215
logretrieveprotein.info("Reading organisms's names success!")
195216
colname = getcolname()
196217
proindexlist, matchlist = getspecies(spelist, colname)
197-
dirpath = retrieveprotein(proindexlist, outpath, matchlist, spelist)
218+
dirpath = retrieveprotein(proindexlist, outpath, matchlist, spelist,local_db)
198219
return dirpath
199220

200221

@@ -218,4 +239,4 @@ def hcp_name(index):
218239
# print("http://rest.kegg.jp/get/" + proid + "/aaseq")
219240
specieslistfile = ['zma', "ath", "eco"]
220241
outpath = "/home/yangfang/test/alg2/"
221-
doretrieve(specieslistfile, outpath)
242+
doretrieve(specieslistfile, outpath,local_db="")

physpetool/physpe/autobuild.py

Lines changed: 41 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,8 @@
5555
mafftpara = "--auto"
5656

5757
auto_build_log = getLogging('Used time')
58+
59+
5860
def start_args(input):
5961
"""
6062
Argument parse
@@ -92,6 +94,9 @@ def start_args(input):
9294
default=False,
9395
help="The esrna mode is use SSU RNA sequence with extend SSU RNA sequence \
9496
(users provide) to reconstruct phylogenetic tree. ")
97+
autobuild_args.add_argument('-db', action='store', dest="db",
98+
default="",
99+
help="The absolute path for local database.")
95100
advance_args.add_argument('--muscle', action='store_true', dest='muscle',
96101
default=True,
97102
help="Multiple sequence alignment by muscle. The default multiple sequence \
@@ -134,9 +139,9 @@ def starting(args):
134139
starting run reconstruct tree
135140
:param args: arguments
136141
"""
137-
print ("Loading organism's names success.....\n")
138-
print ("The result are store in:{0}\n".format(args.outdata))
139-
print ("Now loading data and constructing phylogenetic tree......")
142+
print("Loading organism's names success.....\n")
143+
print("The result are store in:{0}\n".format(args.outdata))
144+
print("Now loading data and constructing phylogenetic tree......")
140145

141146
in_put = args.spenames
142147

@@ -151,7 +156,6 @@ def starting(args):
151156
else:
152157
pass
153158

154-
155159
# Reconstruct phylogenetic tree by ssu RNA.
156160
if args.ssurna:
157161
setlogdir(out_put)
@@ -178,7 +182,8 @@ def starting(args):
178182
args.raxml, args.raxml_parameter,
179183
args.fasttree, args.fasttree_parameter,
180184
args.iqtree, args.iqtree_parameter,
181-
args.thread, args.extenddata)
185+
args.thread, args.extenddata,
186+
args.db)
182187

183188
# Reconstruct phylogenetic tree by extend ssu rna method.
184189
elif args.essurna:
@@ -197,15 +202,16 @@ def starting(args):
197202
elif args.HCP:
198203
setlogdir(out_put)
199204
starting_hcp(in_put, out_put,
200-
args.muscle, args.muscle_parameter,
201-
args.clustalw, args.clustalw_parameter,
202-
args.mafft, args.mafft_parameter,
203-
args.gblocks, args.gblocks_parameter,
204-
args.trimal, args.trimal_parameter,
205-
args.raxml, args.raxml_parameter,
206-
args.fasttree, args.fasttree_parameter,
207-
args.iqtree, args.iqtree_parameter,
208-
args.thread)
205+
args.muscle, args.muscle_parameter,
206+
args.clustalw, args.clustalw_parameter,
207+
args.mafft, args.mafft_parameter,
208+
args.gblocks, args.gblocks_parameter,
209+
args.trimal, args.trimal_parameter,
210+
args.raxml, args.raxml_parameter,
211+
args.fasttree, args.fasttree_parameter,
212+
args.iqtree, args.iqtree_parameter,
213+
args.thread, args.db)
214+
209215

210216
def starting_hcp(in_put, out_put,
211217
args_muscle, args_muscle_p,
@@ -215,25 +221,25 @@ def starting_hcp(in_put, out_put,
215221
args_trimal, args_trimal_p,
216222
args_raxml, args_raxml_p,
217223
args_fasttree, args_fasttree_p,
218-
args_iqtree,args_iqtree_p,
219-
args_thread):
224+
args_iqtree, args_iqtree_p,
225+
args_thread, args_db):
220226
'''reconstruct phylogenetic tree by hcp method'''
221227
hcp_input, recovery_dic = checkKeggOrganism(in_put)
222228
start = time.time()
223-
out_retrieve = doretrieve(hcp_input, out_put)
229+
out_retrieve = doretrieve(hcp_input, out_put,args_db)
224230
end = time.time()
225231
auto_build_log.info('Retrieving HCP sequences used time: {} Seconds'.format(end - start))
226232

227233
if not recovery_dic == {}:
228-
recovery(out_retrieve,recovery_dic)
234+
recovery(out_retrieve, recovery_dic)
229235
# set default aligned by muscle if not specify clustalw
230236
start2 = time.time()
231237
if args_clustalw:
232238
out_alg = doclustalw_file(out_retrieve, out_put, args_clustalw_p)
233239
elif args_mafft:
234-
out_alg = domafft_file(out_retrieve, out_put, args_mafft_p,args_thread)
240+
out_alg = domafft_file(out_retrieve, out_put, args_mafft_p, args_thread)
235241
elif args_muscle:
236-
out_alg = domuscle_file(out_retrieve, out_put, args_muscle_p,args_thread)
242+
out_alg = domuscle_file(out_retrieve, out_put, args_muscle_p, args_thread)
237243

238244
out_concat = cocat_path(out_alg)
239245

@@ -263,17 +269,16 @@ def starting_srna(in_put, out_put,
263269
args_trimal, args_trimal_p,
264270
args_raxml, args_raxml_p,
265271
args_fasttree, args_fasttree_p,
266-
args_iqtree,args_iqtree_p,
272+
args_iqtree, args_iqtree_p,
267273
args_thread):
268274
'''reconstruct phylogenetic tree by ssu rna method'''
269-
ssu_input,recovery_dic = checkSilvaOrganism(in_put)
275+
ssu_input, recovery_dic = checkSilvaOrganism(in_put)
270276
start = time.time()
271277
out_retrieve = retrieve16srna(ssu_input, out_put)
272278
end = time.time()
273279
auto_build_log.info('Retrieving SSU rRNA sequences used time: {} Seconds'.format(end - start))
274280
if not recovery_dic == []:
275-
recovery_silva(out_retrieve,recovery_dic,ssu_input)
276-
281+
recovery_silva(out_retrieve, recovery_dic, ssu_input)
277282

278283
# set default aligned by muscle if not specify clustalw or mafft
279284
start2 = time.time()
@@ -306,6 +311,7 @@ def starting_srna(in_put, out_put,
306311
end2 = time.time()
307312
auto_build_log.info('Constructing species tree used time: {} Seconds'.format(end2 - start2))
308313

314+
309315
def starting_ehcp(in_put, out_put,
310316
args_muscle, args_muscle_p,
311317
args_clustalw, args_clustalw_p,
@@ -314,17 +320,18 @@ def starting_ehcp(in_put, out_put,
314320
args_trimal, args_trimal_p,
315321
args_raxml, args_raxml_p,
316322
args_fasttree, args_fasttree_p,
317-
args_iqtree,args_iqtree_p,
318-
args_thread, args_extenddata):
323+
args_iqtree, args_iqtree_p,
324+
args_thread, args_extenddata,
325+
args_db):
319326
'''reconstruct phylogenetic tree by ehcp method'''
320327
hcp_input, recovery_dic = checkKeggOrganism(in_put)
321328
start = time.time()
322-
out_retrieve = doretrieve(hcp_input, out_put)
329+
out_retrieve = doretrieve(hcp_input, out_put,args_db)
323330
end = time.time()
324331
auto_build_log.info('Retrieving HCP sequences used time: {} Seconds'.format(end - start))
325332

326333
if not recovery_dic == {}:
327-
recovery(out_retrieve,recovery_dic)
334+
recovery(out_retrieve, recovery_dic)
328335
retrieve_pro = os.listdir(out_retrieve)
329336
for reline in retrieve_pro:
330337
fw_name = os.path.join(out_retrieve, reline)
@@ -341,7 +348,7 @@ def starting_ehcp(in_put, out_put,
341348
if args_clustalw:
342349
out_alg = doclustalw_file(out_retrieve, out_put, args_clustalw_p)
343350
elif args_mafft:
344-
out_alg = domafft_file(out_retrieve, out_put, args_mafft_p,args_thread)
351+
out_alg = domafft_file(out_retrieve, out_put, args_mafft_p, args_thread)
345352
elif args_muscle:
346353
out_alg = domuscle_file(out_retrieve, out_put, args_muscle_p, args_thread)
347354

@@ -363,6 +370,7 @@ def starting_ehcp(in_put, out_put,
363370
end2 = time.time()
364371
auto_build_log.info('Constructing species tree used time: {} Seconds'.format(end2 - start2))
365372

373+
366374
def starting_esrna(in_put, out_put,
367375
args_muscle, args_muscle_p,
368376
args_clustalw, args_clustalw_p,
@@ -371,17 +379,17 @@ def starting_esrna(in_put, out_put,
371379
args_trimal, args_trimal_p,
372380
args_raxml, args_raxml_p,
373381
args_fasttree, args_fasttree_p,
374-
args_iqtree,args_iqtree_p,
382+
args_iqtree, args_iqtree_p,
375383
args_thread, args_extenddata):
376384
'''reconstruct phylogenetic tree by ssu rna extend method'''
377385
extend_check = checkFile(args_extenddata)
378-
ssu_input,recovery_dic = checkSilvaOrganism(in_put)
386+
ssu_input, recovery_dic = checkSilvaOrganism(in_put)
379387
start = time.time()
380388
out_retrieve = retrieve16srna(ssu_input, out_put)
381389
end = time.time()
382390
auto_build_log.info('Retrieving SSU rRNA sequences used time: {} Seconds'.format(end - start))
383391
if not recovery_dic == {}:
384-
recovery(out_retrieve,recovery_dic)
392+
recovery(out_retrieve, recovery_dic)
385393
retrieve_srna_path = os.path.join(out_retrieve, 'rna_sequence.fasta')
386394

387395
fw = open(retrieve_srna_path, 'a')
@@ -418,4 +426,4 @@ def starting_esrna(in_put, out_put,
418426
args_raxml_p = raxmlpara_dna
419427
doraxml(out_f2p, out_put, args_raxml_p, args_thread)
420428
end2 = time.time()
421-
auto_build_log.info('Constructing species tree used time: {} Seconds'.format(end2 - start2))
429+
auto_build_log.info('Constructing species tree used time: {} Seconds'.format(end2 - start2))

0 commit comments

Comments
 (0)