1+ import os
2+ import pandas as pd
3+ import numpy as np
4+ import warnings
5+ warnings .filterwarnings ("ignore" )
6+ from collections import Counter
7+ from Bio .SeqUtils .ProtParam import ProteinAnalysis
8+ from Bio import SeqIO
9+ from sklearn import preprocessing
10+
11+ def file_name (file_dir ,gb_fas ):
12+ for root ,dirs ,files in os .walk (file_dir ):
13+ LL1 = []
14+ for ff in files :
15+ if os .path .splitext (ff )[1 ]== gb_fas :
16+ LL1 .append (os .path .join (ff ))
17+ return LL1
18+
19+ # Encode the AAC feature with the protein sequence
20+ def AAC_feature (fastas ):
21+ AA = 'ACDEFGHIKLMNPQRSTVWY*'
22+ encodings = []
23+ header = ['#' ]
24+ for i in AA :
25+ header .append (i )
26+ encodings .append (header )
27+ sequence = fastas
28+ count = Counter (sequence )
29+ for key in count :
30+ count [key ] = float (count [key ])/ len (sequence )
31+ code = []
32+ for aa in AA :
33+ code .append (count [aa ])
34+ encodings .append (code )
35+ return code
36+
37+ # Extract the physical-chemical properties from the protein sequnce
38+ def physical_chemical_feature (sequence ):
39+ seq_new = sequence .replace ('X' ,'' ).replace ('U' ,'' ).replace ('B' ,'' ).replace ('Z' ,'' ).replace ('J' ,'' )
40+ CE = 'CHONS'
41+ Chemi_stats = {'A' :{'C' : 3 , 'H' : 7 , 'O' : 2 , 'N' : 1 , 'S' : 0 },
42+ 'C' :{'C' : 3 , 'H' : 7 , 'O' : 2 , 'N' : 1 , 'S' : 1 },
43+ 'D' :{'C' : 4 , 'H' : 7 , 'O' : 4 , 'N' : 1 , 'S' : 0 },
44+ 'E' :{'C' : 5 , 'H' : 9 , 'O' : 4 , 'N' : 1 , 'S' : 0 },
45+ 'F' :{'C' : 9 , 'H' : 11 ,'O' : 2 , 'N' : 1 , 'S' : 0 },
46+ 'G' :{'C' : 2 , 'H' : 5 , 'O' : 2 , 'N' : 1 , 'S' : 0 },
47+ 'H' :{'C' : 6 , 'H' : 9 , 'O' : 2 , 'N' : 3 , 'S' : 0 },
48+ 'I' :{'C' : 6 , 'H' : 13 ,'O' : 2 , 'N' : 1 , 'S' : 0 },
49+ 'K' :{'C' : 6 , 'H' : 14 ,'O' : 2 , 'N' : 2 , 'S' : 0 },
50+ 'L' :{'C' : 6 , 'H' : 13 ,'O' : 2 , 'N' : 1 , 'S' : 0 },
51+ 'M' :{'C' : 5 , 'H' : 11 ,'O' : 2 , 'N' : 1 , 'S' : 1 },
52+ 'N' :{'C' : 4 , 'H' : 8 , 'O' : 3 , 'N' : 2 , 'S' : 0 },
53+ 'P' :{'C' : 5 , 'H' : 9 , 'O' : 2 , 'N' : 1 , 'S' : 0 },
54+ 'Q' :{'C' : 5 , 'H' : 10 ,'O' : 3 , 'N' : 2 , 'S' : 0 },
55+ 'R' :{'C' : 6 , 'H' : 14 ,'O' : 2 , 'N' : 4 , 'S' : 0 },
56+ 'S' :{'C' : 3 , 'H' : 7 , 'O' : 3 , 'N' : 1 , 'S' : 0 },
57+ 'T' :{'C' : 4 , 'H' : 9 , 'O' : 3 , 'N' : 1 , 'S' : 0 },
58+ 'V' :{'C' : 5 , 'H' : 11 ,'O' : 2 , 'N' : 1 , 'S' : 0 },
59+ 'W' :{'C' : 11 ,'H' : 12 ,'O' : 2 , 'N' : 2 , 'S' : 0 },
60+ 'Y' :{'C' : 9 , 'H' : 11 ,'O' : 3 , 'N' : 1 , 'S' : 0 }
61+ }
62+
63+ count = Counter (seq_new )
64+ code = []
65+
66+ for c in CE :
67+ abundance_c = 0
68+ for key in count :
69+ num_c = Chemi_stats [key ][c ]
70+ abundance_c += num_c * count [key ]
71+
72+ code .append (abundance_c )
73+ return (code )
74+
75+ # calculate the protein molecular for the protein sequnce.
76+ def molecular_weight (seq ):
77+ seq_new = seq .replace ('X' ,'' ).replace ('U' ,'' ).replace ('B' ,'' ).replace ('Z' ,'' ).replace ('J' ,'' )
78+ analysed_seq = ProteinAnalysis (seq_new )
79+ analysed_seq .monoisotopic = True
80+ mw = analysed_seq .molecular_weight ()
81+ return ([mw ])
82+
83+ if __name__ == '__main__' :
84+ inters = pd .read_csv ('../data/data_pos_neg.txt' ,header = None ,sep = '\t ' )
85+ phages = []
86+ hosts = []
87+ for i in inters .index :
88+ phages .append (inters .loc [i ,0 ])
89+ hosts .append (inters .loc [i ,1 ])
90+ phages = list (set (phages ))
91+ hosts = list (set (hosts ))
92+
93+ ####gb file is download from NCBI
94+ phage_features = []
95+ for pp in phages :
96+ print (pp )
97+ a = []
98+ filepath = '../data/phagegb/' + pp + '.gb'
99+ for record1 in SeqIO .parse (filepath ,"genbank" ):
100+ if (record1 .features ):
101+ myseq1 = record1 .seq
102+ for feature1 in record1 .features :
103+ if feature1 .type == "CDS" and 'translation' in feature1 .qualifiers :
104+ seq1 = feature1 .qualifiers ['translation' ][0 ]
105+ a .append (physical_chemical_feature (seq1 )+ molecular_weight (seq1 )+ AAC_feature (seq1 ))
106+ xx = np .array (a )
107+ phage_features .append (xx .mean (axis = 0 ).tolist ()+ xx .max (axis = 0 ).tolist ()+ xx .min (axis = 0 ).tolist ()+
108+ xx .std (axis = 0 ).tolist ()+ xx .var (axis = 0 ).tolist ()+ np .median (xx , axis = 0 ).tolist ())
109+ host_features = []
110+ for hh in hosts :
111+ print (hh )
112+ a = []
113+ filepath = '../data/hostgb/' + hh + '.gb'
114+ for record1 in SeqIO .parse (filepath ,"genbank" ):
115+ if (record1 .features ):
116+ myseq1 = record1 .seq
117+ for feature1 in record1 .features :
118+ if feature1 .type == "CDS" and 'translation' in feature1 .qualifiers :
119+ seq1 = feature1 .qualifiers ['translation' ][0 ]
120+ a .append (physical_chemical_feature (seq1 )+ molecular_weight (seq1 )+ AAC_feature (seq1 ))
121+ xx = np .array (a )
122+ host_features .append (xx .mean (axis = 0 ).tolist ()+ xx .max (axis = 0 ).tolist ()+ xx .min (axis = 0 ).tolist ()+
123+ xx .std (axis = 0 ).tolist ()+ xx .var (axis = 0 ).tolist ()+ np .median (xx , axis = 0 ).tolist ())
124+ ###save and normalize features
125+ min_max_scaler1 = preprocessing .MinMaxScaler ()
126+ phage_features_norm = min_max_scaler1 .fit_transform (phage_features )
127+ min_max_scaler2 = preprocessing .MinMaxScaler ()
128+ host_features_norm = min_max_scaler2 .fit_transform (host_features )
129+ for pp in range (len (phages )):
130+ np .savetxt ('../data/phage_protein_normfeatures/' + phages [pp ]+ '.txt' ,phage_features_norm [pp ,:])
131+ for hh in range (len (hosts )):
132+ np .savetxt ('../data/host_protein_normfeatures/' + hosts [hh ]+ '.txt' ,host_features_norm [hh ,:])
133+
134+
135+
136+
137+
138+
139+
140+
141+
142+
143+
0 commit comments