Skip to content

Commit 45c2282

Browse files
committed
first commit
0 parents  commit 45c2282

File tree

1,089 files changed

+271881
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,089 files changed

+271881
-0
lines changed
1.44 KB
Binary file not shown.
1.46 KB
Binary file not shown.
2.1 KB
Binary file not shown.

code/compute_dna_features.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import numpy as np
2+
import os
3+
import warnings
4+
warnings.filterwarnings("ignore")
5+
import pandas as pd
6+
from sklearn import preprocessing
7+
8+
def file_name(file_dir,gb_fas):
9+
for root,dirs,files in os.walk(file_dir):
10+
LL1=[]
11+
for ff in files:
12+
if os.path.splitext(ff)[1]==gb_fas:
13+
LL1.append(os.path.join(ff))
14+
return LL1
15+
###using iLearn tool to compute features
16+
for mm in ['phage','host']:
17+
for method in ['Kmer','RCKmer','NAC','DNC','TNC','CKSNAP','PseEIIP']:
18+
print(mm,method)
19+
os.system('python ../iLearn-master/iLearn-nucleotide-basic.py --file all%s_dna_seq.fasta --method %s --format csv --out all%s_seq_%s.csv'%(mm,method,method,mm))
20+
21+
###combine features
22+
phage1=np.loadtxt('../data/allKmer_seq_phage.csv',delimiter=',')[:,1:]
23+
host1=np.loadtxt('../data/allKmer_seq_host.csv',delimiter=',')[:,1:]
24+
print(phage1.shape[1])
25+
for method in ['RCKmer','NAC','DNC','TNC','CKSNAP','PseEIIP']:
26+
phage2=np.loadtxt('../data/all'+method+'_seq_phage.csv',delimiter=',')[:,1:]
27+
host2=np.loadtxt('../data/all'+method+'_seq_host.csv',delimiter=',')[:,1:]
28+
phage1=np.hstack((phage1,phage2))
29+
host1=np.hstack((host1,host2))
30+
31+
###save and normalize features
32+
inters=pd.read_csv('../data/data_pos_neg.txt',header=None,sep='\t')
33+
phages=[]
34+
hosts=[]
35+
for i in inters.index:
36+
phages.append(inters.loc[i,0])
37+
hosts.append(inters.loc[i,1])
38+
phages=list(set(phages))
39+
hosts=list(set(hosts))
40+
41+
min_max_scaler1 = preprocessing.MinMaxScaler()
42+
phage_features_norm = min_max_scaler1.fit_transform(phage1)
43+
min_max_scaler2 = preprocessing.MinMaxScaler()
44+
host_features_norm = min_max_scaler2.fit_transform(host1)
45+
for pp in range(len(phages)):
46+
np.savetxt('../data/phage_dna_norm_features/'+phages[pp]+'.txt',phage_features_norm[pp,:])
47+
for hh in range(len(hosts)):
48+
np.savetxt('../data/host_dna_norm_features/'+hosts[hh]+'.txt',host_features_norm[hh,:])
49+

code/compute_protein_features.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
import os
2+
import pandas as pd
3+
import numpy as np
4+
import warnings
5+
warnings.filterwarnings("ignore")
6+
from collections import Counter
7+
from Bio.SeqUtils.ProtParam import ProteinAnalysis
8+
from Bio import SeqIO
9+
from sklearn import preprocessing
10+
11+
def file_name(file_dir,gb_fas):
12+
for root,dirs,files in os.walk(file_dir):
13+
LL1=[]
14+
for ff in files:
15+
if os.path.splitext(ff)[1]==gb_fas:
16+
LL1.append(os.path.join(ff))
17+
return LL1
18+
19+
# Encode the AAC feature with the protein sequence
20+
def AAC_feature(fastas):
21+
AA = 'ACDEFGHIKLMNPQRSTVWY*'
22+
encodings = []
23+
header = ['#']
24+
for i in AA:
25+
header.append(i)
26+
encodings.append(header)
27+
sequence = fastas
28+
count = Counter(sequence)
29+
for key in count:
30+
count[key] = float(count[key])/len(sequence)
31+
code = []
32+
for aa in AA:
33+
code.append(count[aa])
34+
encodings.append(code)
35+
return code
36+
37+
# Extract the physical-chemical properties from the protein sequnce
38+
def physical_chemical_feature(sequence):
39+
seq_new=sequence.replace('X','').replace('U','').replace('B','').replace('Z','').replace('J','')
40+
CE = 'CHONS'
41+
Chemi_stats = {'A':{'C': 3, 'H': 7, 'O': 2, 'N': 1, 'S': 0},
42+
'C':{'C': 3, 'H': 7, 'O': 2, 'N': 1, 'S': 1},
43+
'D':{'C': 4, 'H': 7, 'O': 4, 'N': 1, 'S': 0},
44+
'E':{'C': 5, 'H': 9, 'O': 4, 'N': 1, 'S': 0},
45+
'F':{'C': 9, 'H': 11,'O': 2, 'N': 1, 'S': 0},
46+
'G':{'C': 2, 'H': 5, 'O': 2, 'N': 1, 'S': 0},
47+
'H':{'C': 6, 'H': 9, 'O': 2, 'N': 3, 'S': 0},
48+
'I':{'C': 6, 'H': 13,'O': 2, 'N': 1, 'S': 0},
49+
'K':{'C': 6, 'H': 14,'O': 2, 'N': 2, 'S': 0},
50+
'L':{'C': 6, 'H': 13,'O': 2, 'N': 1, 'S': 0},
51+
'M':{'C': 5, 'H': 11,'O': 2, 'N': 1, 'S': 1},
52+
'N':{'C': 4, 'H': 8, 'O': 3, 'N': 2, 'S': 0},
53+
'P':{'C': 5, 'H': 9, 'O': 2, 'N': 1, 'S': 0},
54+
'Q':{'C': 5, 'H': 10,'O': 3, 'N': 2, 'S': 0},
55+
'R':{'C': 6, 'H': 14,'O': 2, 'N': 4, 'S': 0},
56+
'S':{'C': 3, 'H': 7, 'O': 3, 'N': 1, 'S': 0},
57+
'T':{'C': 4, 'H': 9, 'O': 3, 'N': 1, 'S': 0},
58+
'V':{'C': 5, 'H': 11,'O': 2, 'N': 1, 'S': 0},
59+
'W':{'C': 11,'H': 12,'O': 2, 'N': 2, 'S': 0},
60+
'Y':{'C': 9, 'H': 11,'O': 3, 'N': 1, 'S': 0}
61+
}
62+
63+
count = Counter(seq_new)
64+
code = []
65+
66+
for c in CE:
67+
abundance_c = 0
68+
for key in count:
69+
num_c = Chemi_stats[key][c]
70+
abundance_c += num_c * count[key]
71+
72+
code.append(abundance_c)
73+
return(code)
74+
75+
# calculate the protein molecular for the protein sequnce.
76+
def molecular_weight(seq):
77+
seq_new=seq.replace('X','').replace('U','').replace('B','').replace('Z','').replace('J','')
78+
analysed_seq = ProteinAnalysis(seq_new)
79+
analysed_seq.monoisotopic = True
80+
mw = analysed_seq.molecular_weight()
81+
return([mw])
82+
83+
if __name__ == '__main__':
84+
inters=pd.read_csv('../data/data_pos_neg.txt',header=None,sep='\t')
85+
phages=[]
86+
hosts=[]
87+
for i in inters.index:
88+
phages.append(inters.loc[i,0])
89+
hosts.append(inters.loc[i,1])
90+
phages=list(set(phages))
91+
hosts=list(set(hosts))
92+
93+
####gb file is download from NCBI
94+
phage_features=[]
95+
for pp in phages:
96+
print(pp)
97+
a=[]
98+
filepath='../data/phagegb/'+pp+'.gb'
99+
for record1 in SeqIO.parse(filepath,"genbank"):
100+
if(record1.features):
101+
myseq1=record1.seq
102+
for feature1 in record1.features:
103+
if feature1.type=="CDS" and 'translation' in feature1.qualifiers:
104+
seq1=feature1.qualifiers['translation'][0]
105+
a.append(physical_chemical_feature(seq1)+molecular_weight(seq1)+AAC_feature(seq1))
106+
xx=np.array(a)
107+
phage_features.append(xx.mean(axis=0).tolist()+xx.max(axis=0).tolist()+xx.min(axis=0).tolist()+
108+
xx.std(axis=0).tolist()+xx.var(axis=0).tolist()+np.median(xx, axis=0).tolist())
109+
host_features=[]
110+
for hh in hosts:
111+
print(hh)
112+
a=[]
113+
filepath='../data/hostgb/'+hh+'.gb'
114+
for record1 in SeqIO.parse(filepath,"genbank"):
115+
if(record1.features):
116+
myseq1=record1.seq
117+
for feature1 in record1.features:
118+
if feature1.type=="CDS" and 'translation' in feature1.qualifiers:
119+
seq1=feature1.qualifiers['translation'][0]
120+
a.append(physical_chemical_feature(seq1)+molecular_weight(seq1)+AAC_feature(seq1))
121+
xx=np.array(a)
122+
host_features.append(xx.mean(axis=0).tolist()+xx.max(axis=0).tolist()+xx.min(axis=0).tolist()+
123+
xx.std(axis=0).tolist()+xx.var(axis=0).tolist()+np.median(xx, axis=0).tolist())
124+
###save and normalize features
125+
min_max_scaler1 = preprocessing.MinMaxScaler()
126+
phage_features_norm = min_max_scaler1.fit_transform(phage_features)
127+
min_max_scaler2 = preprocessing.MinMaxScaler()
128+
host_features_norm = min_max_scaler2.fit_transform(host_features)
129+
for pp in range(len(phages)):
130+
np.savetxt('../data/phage_protein_normfeatures/'+phages[pp]+'.txt',phage_features_norm[pp,:])
131+
for hh in range(len(hosts)):
132+
np.savetxt('../data/host_protein_normfeatures/'+hosts[hh]+'.txt',host_features_norm[hh,:])
133+
134+
135+
136+
137+
138+
139+
140+
141+
142+
143+

0 commit comments

Comments
 (0)