2626
2727
2828"""
29+ import glob
2930import ftplib
3031import os
3132import sqlite3
3738logretrieveprotein = getLogging ('KEGG INDEX DB' )
3839KEGGDB = "KEGG_DB_3.0.db"
3940
41+
4042def getspecies (name , colname ):
4143 """
4244 Get species protein index for DB
@@ -61,7 +63,7 @@ def getspecies(name, colname):
6163
6264 num_none = len ([x for x in idslist if x == 'None' ])
6365
64- if num_none != len (idslist ) :
66+ if num_none != len (idslist ):
6567 relist .append (idslist )
6668 match_ko_name .append (ko )
6769 else :
@@ -88,7 +90,8 @@ def splist(l, s):
8890 return [l [i :i + s ] for i in range (len (l )) if i % s == 0 ]
8991
9092
91- def retrieveprotein (proindexlist , outpath , matchlist ,spelist ):
93+
94+ def retrieveprotein (proindexlist , outpath , matchlist , spelist ):
9295 """
9396 Retrieve proteins form Kegg DB
9497 :param proindexlist: a list contain protein index
@@ -103,14 +106,33 @@ def retrieveprotein(proindexlist, outpath, matchlist,spelist):
103106 if not os .path .exists (dirname ):
104107 os .makedirs (dirname )
105108
106-
107109 connect = ftplib .FTP ("173.255.208.244" )
108110 connect .login ('anonymous' )
109111 connect .cwd ('/pub/databasehcp' )
110112 p = 1
111-
112113 # deal with none
113-
114+ # connect.mkd("tem/")
115+ fasta = {}
116+
117+ for line in spelist :
118+ w_file = dirname + "/" + line + ".fa"
119+ fw_ = open (w_file , 'ab' )
120+ retrievename = line + '.fasta'
121+ remoteFileName = 'RETR ' + os .path .basename (retrievename )
122+ connect .retrbinary (remoteFileName , fw_ .write )
123+ fw_ .write (b'\n ' )
124+ fw_ .close ()
125+ logretrieveprotein .info ("Retrieve " + line + " highly conserved proteins completely" )
126+ # read get sequences
127+ with open (w_file , 'r' ) as f :
128+ for line in f :
129+ if line != "\n " :
130+ tem = line .strip ()
131+ if tem [0 ] == '>' :
132+ header = tem [1 :]
133+ else :
134+ sequence = tem
135+ fasta [header ] = fasta .get (header , '' ) + sequence
114136
115137 for index in proindexlist :
116138 have_none = False
@@ -124,30 +146,32 @@ def retrieveprotein(proindexlist, outpath, matchlist,spelist):
124146 else :
125147 q_index = index
126148
127-
128149 hcp_pro_name = hcp_name (matchlist [p - 1 ])
129150
130151 wfiles = "{0}/p{1}.fasta" .format (dirname , p )
131- fw = open (wfiles , 'ab ' )
152+ fw = open (wfiles , 'a ' )
132153
133154 for id in q_index :
134- not_get_abb = []
135- retrievename = id + '.fasta'
136- remoteFileName = 'RETR ' + os .path .basename (retrievename )
137- connect .retrbinary (remoteFileName , fw .write )
138- fw .write (b'\n ' )
139-
140- if have_none :
141- for line in app_spe :
142- name_none = ">" + line + "\n "
143- fw .write (name_none .encode ())
144- fw .write (b"M" + b"\n " )
155+ abb_name = id .strip ().split (":" )[0 ]
156+ if id in fasta .keys ():
157+ fw .write (">" + abb_name + "\n " + fasta [id ]+ "\n " )
158+ else :
159+ name_none = ">" + abb_name + "\n "
160+ fw .write (name_none + "M" + "\n " )
161+ if have_none :
162+ for line in app_spe :
163+ name_none = ">" + line + "\n "
164+ fw .write (name_none + "M" + "\n " )
145165 fw .close ()
146166
147167 logretrieveprotein .info (
148- "Retrieve and download of highly conserved protein '{0}' was successful store in p{1}.fasta file" .format (hcp_pro_name , str (p )))
168+ "Retrieve and download of highly conserved protein '{0}' was successful store in p{1}.fasta file" .format (
169+ hcp_pro_name , str (p )))
149170 p += 1
150171 logretrieveprotein .info ("Retrieve from KEGG database " + str (p - 1 ) + " highly conserved proteins" )
172+ connect .quit ()
173+ for infile in glob .glob (os .path .join (dirname , '*.fa' )):
174+ os .remove (infile )
151175 return dirname
152176
153177
@@ -161,7 +185,7 @@ def doretrieve(specieslistfile, outpath):
161185 logretrieveprotein .info ("Reading organisms's names success!" )
162186 colname = getcolname ()
163187 proindexlist , matchlist = getspecies (spelist , colname )
164- dirpath = retrieveprotein (proindexlist , outpath , matchlist ,spelist )
188+ dirpath = retrieveprotein (proindexlist , outpath , matchlist , spelist )
165189 return dirpath
166190
167191
@@ -177,12 +201,12 @@ def hcp_name(index):
177201
178202
179203if __name__ == '__main__' :
180- print (getcolname ())
181- print (getspecies (['ath' ], ['K01409' ]))
204+ print (getcolname ())
205+ print (getspecies (['ath' ], ['K01409' ]))
182206 # for line in getcolname():
183207 # if getspecies(['mdm'], [line])[0] != []:
184208 # proid = getspecies(['mdm'], [line])[0][0][0]
185209 # print("http://rest.kegg.jp/get/" + proid + "/aaseq")
186- specieslistfile = ['zma' ,"ath" ]
210+ specieslistfile = ['zma' , "ath" , "eco " ]
187211 outpath = "/home/yangfang/test/alg2/"
188- doretrieve (specieslistfile , outpath )
212+ doretrieve (specieslistfile , outpath )
0 commit comments