Skip to content

Commit e0166f9

Browse files
committed
edit suffix tree
2 parents b7dcd00 + ddd9164 commit e0166f9

File tree

7 files changed

+200
-127
lines changed

7 files changed

+200
-127
lines changed

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ CC=gcc
33
CPP=g++
44
AR=ar
55
RANLIB=ranlib
6-
CFLAGS= -g -Wall -Wno-unused-function
6+
CFLAGS= -g -Wall -Wno-unused-function -std=gnu++0x
77
SRCDIR = ./src
88
INCLUDEDIR = -I./include -I.
99
DEPS =
@@ -69,6 +69,7 @@ PROGRAMS = m_based_demo \
6969
selection_sort_demo \
7070
8queue_demo \
7171
palindrome_demo \
72+
suffix_array_demo \
7273
suffix_tree_demo
7374

7475
all: $(PROGRAMS)

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@
5555
Red-black tree
5656
Interval tree
5757
Prefix Tree(Trie)
58-
*Suffix Tree(未实现)*
58+
Suffix Tree
5959
B-Tree
60+
Suffix Array
6061

6162
Hash by multiplication
6263
Hash table
@@ -96,4 +97,5 @@
9697
wycg1984: for K-Means
9798
xmuliang: for HeapSort, Kruskal MST
9899
wyh267: for base64, LRU, bubble sort, selection sort
99-
ZhangYou0122 : Push-Relabel algorithm
100+
ZhangYou0122: Push-Relabel algorithm, Suffix Tree
101+
UsingtcNower: Suffix Array

include/prime.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ namespace alg {
3535
if (n%2 == 0) return false;
3636

3737
unsigned sqrtn = sqrt(n);
38-
for (unsigned int i = 2; i <= sqrtn; ++i) {
38+
for (unsigned int i = 3; i <= sqrtn; i+=2) {
3939
if (n % i == 0) {
4040
return false;
4141
}

include/suffix_array.h

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*******************************************************************************
2+
* ALGORITHM IMPLEMENTATIONS
3+
*
4+
* /\ | _ _ ._ o _|_ |_ ._ _ _
5+
* /--\ | (_| (_) | | |_ | | | | | _>
6+
* _|
7+
*
8+
* SUFFIX ARRAY
9+
*
10+
* Features:
11+
* A suffix array sorts all the suffixes of a string in O(n*log^2(n)) time,
12+
* and uses O(n*log(n)) memory for its rank tables. It can also find two suffixes' longest
13+
* common prefix(lcp) in O(log(n)) complexity.
14+
*
15+
* You can test it by running suffix_array_demo.cpp
16+
* Want to get more detailed information about suffix array?
17+
*
18+
* Please google SUFF_AR_ENG.pdf
19+
*
20+
21+
******************************************************************************/
22+
23+
#ifndef __SUFFIX_ARRAY_H__
24+
#define __SUFFIX_ARRAY_H__
25+
26+
#include <algorithm>
27+
#include <vector>
28+
#include <string>
29+
#include <math.h>
30+
31+
using namespace std;
32+
33+
namespace alg {
	/**
	 * Suffix array built by prefix doubling.
	 *
	 * Construction runs about log2(N) sorting passes. Pass k orders the
	 * suffixes by their first 2^k characters and stores each suffix's rank in
	 * bucket[k]. These rank tables are kept after construction so that
	 * lcp_length() can compare prefixes of length 2^k with one lookup.
	 *
	 * The object stores a reference to the input string, not a copy. The
	 * string is only read while the constructor runs; later queries use the
	 * rank tables alone.
	 */
	class SuffixArray {
		private:
			// bucket[k][i]: rank of the suffix starting at i when only its first
			// 2^k characters are compared. Every table has 2*N entries; entries
			// N..2N-1 stay at -1 and act as an "end of string" rank that sorts
			// before every real rank.
			std::vector<std::vector<int> > bucket;
			// Start positions of the suffixes, in sorted order.
			std::vector<int> suffix;
			// N: string length. L: prefix length handled by the current pass
			// (2^K). K: index of the current pass; after construction, the
			// number of rank tables.
			int N, L, K;
			const std::string& str;

			void suffix_sort();
			void update_bucket();

			// Strict ordering of suffixes a and b by their first L characters.
			// Pass 0 compares single characters; later passes compare the pair
			// (rank of first half, rank of second half) taken from the
			// previous pass.
			bool less_than(int a, int b) const {
				if (K == 0) return str[a] < str[b];
				const std::vector<int>& prev = bucket[K-1];
				if (prev[a] != prev[b]) return prev[a] < prev[b];
				return prev[a + L/2] < prev[b + L/2];
			}

			// True when a and b are indistinguishable by their first L characters.
			bool equal(int a, int b) const {
				return !less_than(a, b) && !less_than(b, a);
			}

		public:
			// Builds the suffix array of s. s must outlive the constructor call.
			explicit SuffixArray(const std::string& s)
				: N(static_cast<int>(s.size())), L(0), K(0), str(s) { suffix_sort(); }

			// Returns the start position of the i-th smallest suffix.
			// i must be in [0, N); no bounds check is performed.
			int operator [] (int i) const { return suffix[i]; }

			// Given the start positions of two suffixes, returns the length of
			// their longest common prefix.
			int lcp_length(int x, int y) const;
	};

	// Sorts the suffixes by prefix doubling, adding one rank table per pass.
	// An empty string performs no pass and leaves K == 0.
	inline void SuffixArray::suffix_sort() {
		suffix.resize(N);
		for (int i = 0; i < N; i++) suffix[i] = i;
		bucket.clear();

		// Stop once the previous pass already compared at least N characters.
		for (L = 1, K = 0; (L >> 1) < N; L <<= 1, K++) {
			// Capture `this` rather than a copy of the object: std::sort may
			// copy its comparator, and a by-value copy would duplicate every
			// rank table each time.
			std::sort(suffix.begin(), suffix.end(),
					[this](int a, int b) { return less_than(a, b); });
			// The new table starts at -1 everywhere, so its tail needs no fill.
			bucket.push_back(std::vector<int>(N + N, -1));
			update_bucket();
		}
	}

	// Fills bucket[K] from the freshly sorted suffix order: suffixes that
	// compare equal on their first L characters share a rank.
	// Requires N >= 1 (guaranteed by the loop in suffix_sort).
	inline void SuffixArray::update_bucket() {
		std::vector<int>& rank = bucket[K];
		int seq = 0;
		rank[suffix[0]] = 0;
		for (int i = 1; i < N; i++) {
			if (!equal(suffix[i], suffix[i-1])) seq++;
			rank[suffix[i]] = seq;
		}
	}

	// Walks the rank tables from the longest prefix length to the shortest.
	// Whenever the two current positions share a rank at level k, the next
	// 2^k characters match, so both positions advance by 2^k.
	// x and y must be in [0, N] and non-negative.
	inline int SuffixArray::lcp_length(int x, int y) const {
		if (x == y) return N - x;
		int ret = 0;
		for (int k = K-1; k >= 0 && x < N && y < N; k--) {
			if (bucket[k][x] == bucket[k][y]) {
				const int step = 1 << k;
				x += step;
				y += step;
				ret += step;
			}
		}
		return ret;
	}
}
101+
102+
#endif // __SUFFIX_ARRAY_H__

include/suffix_tree.h

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class SuffixTree
2121
{
2222
public:
2323
// active point is initialized as (root, None, 0), remainder initialized as 0
24-
SuffixTree(string str):test_str(str), root(test_str), active_point(&root, NULL, 0), remainder(0), pos(0), base_pos(0), ls() {}
24+
SuffixTree(string str):test_str(str), root(test_str), active_point(&root, 0, 0), remainder(0), pos(0), active_e(0), ls() {}
2525
int construct(void);
2626

2727
// return -1 if no such substring exists; otherwise return the beginning position of this substring in the original string
@@ -229,10 +229,10 @@ class SuffixTree
229229
class ActivePoint{
230230
public:
231231
Node* active_node;
232-
Edge* active_edge;
232+
char active_edge;
233233
int active_length;
234234

235-
ActivePoint(Node* node, Edge* edge, int length):
235+
ActivePoint(Node* node, char edge, int length):
236236
active_node(node), active_edge(edge), active_length(length) { std::cout << "ActivePoint initialized" << std::endl; }
237237
};
238238

@@ -241,8 +241,11 @@ class SuffixTree
241241

242242
Node* get_active_node(void) { return active_point.active_node; }
243243
void set_active_node(Node* node) { active_point.active_node = node; cout << "Active node set as " << node << endl; }
244-
Edge* get_active_edge(void) { return active_point.active_edge; }
245-
void set_active_edge(Edge* edge) { active_point.active_edge = edge; }
244+
char get_active_edge(void)
245+
{
246+
return test_str[active_e];
247+
}
248+
246249
int get_active_length(void) { return active_point.active_length; }
247250
void set_active_length(int len) { active_point.active_length = len; }
248251
void inc_active_len() { active_point.active_length++; }
@@ -252,7 +255,7 @@ class SuffixTree
252255
int remainder;
253256
// how many characters inserted?
254257
unsigned int pos;
255-
unsigned int base_pos; // the beginnig position of suffixes need to be inserted
258+
unsigned int active_e; // the beginnig position of suffixes need to be inserted
256259
char get_ele(int i) { return test_str[i]; }
257260
// insert a char from pos to suffix tree
258261
int insert();
@@ -261,36 +264,30 @@ class SuffixTree
261264
int print_node(Node* node, int level);
262265

263266

264-
Node* seperate_edge(Node * node, Edge* edge, int rule);
267+
Node* seperate_edge(Node * node, Edge* edge);
265268

266269
// check if we can change active node
267-
void check_active_node(void)
270+
bool check_active_node(void)
268271
{
269272
Node* node = get_active_node();
270-
Edge* edge = get_active_edge();
273+
char a_char = get_active_edge();
274+
Edge* edge = node->find_edge(a_char);
271275

272276
if (edge == NULL)
273-
return;
277+
return false;
274278

275279
unsigned int edge_size = edge->end - edge->begin + 1;
276280
unsigned int length = get_active_length();
277281

278282
// update
279-
if (edge_size == length) {
283+
if (length >= edge_size) {
280284
set_active_node(edge->endpoint);
281-
set_active_edge(0);
282-
set_active_length(0);
283-
base_pos += edge_size;
284-
}
285-
else if (length > edge_size) {
286285
set_active_length(length-edge_size);
287-
set_active_node(edge->endpoint);
288-
int new_length = get_active_length();
289-
base_pos += edge_size;
290-
Edge *new_active_edge = edge->endpoint->find_edge(get_ele(base_pos));
291-
set_active_edge(new_active_edge);
292-
check_active_node();
286+
active_e += edge_size;
287+
288+
return true;
293289
}
290+
return false;
294291
}
295292

296293
// this class indicate when shall we insert a suffix link

src/suffix_array_demo.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#include <cstdlib>
#include <iostream>
#include <string>
#include <math.h>

#include "suffix_array.h"
6+
7+
using namespace std;
8+
using namespace alg;
9+
10+
// Writes the characters in the range [b, e) to standard output, one by one,
// without a trailing newline.
void print(std::string::iterator b, std::string::iterator e) {
	while (b != e) {
		std::cout << *b;
		++b;
	}
}
13+
14+
int main()
15+
{
16+
string str;
17+
while(cin>>str) {
18+
SuffixArray sa(str);
19+
cout<<endl;
20+
cout<<"sorted suffixs are:"<<endl;
21+
for(size_t i=0;i<str.size();i++) {
22+
print(str.begin()+sa[i], str.end());
23+
cout<<endl;
24+
}
25+
cout<<endl;
26+
cout<<"The length of the longest common prefix of two suffixs ";
27+
int i=rand()%str.size();
28+
int j=rand()%str.size();
29+
print(str.begin()+i,str.end());
30+
cout<<" and ";
31+
print(str.begin()+j,str.end());
32+
cout<<" is ";
33+
cout<<sa.lcp_length(i,j)<<endl;
34+
cout<<endl;
35+
}
36+
return 0;
37+
}

0 commit comments

Comments
 (0)