Given a set of words (or patterns), how can we find out if a string is a concatenation of these words (repetition is allowed). For example:
W = {"12", "122"}
s = "12122" -> match
s = "1122" -> miss
s = "1221212" -> match
I came up with a crappy solution O(n^2)
bool slow_match(const std::string &s, const std::unordered_set<std::string> &dict) {
std::vector<int> exists(s.size(), 0);
for (int i = 0; i < s.size(); ++i) {
if (dict.find(s.substr(0, i + 1)) != dict.end()) {
exists[i] = i + 1;
}
for (int j = 0; j < i && exists[i] == 0; ++j) {
if (exists[j] != 0 && dict.find(s.substr(j + 1, i - j)) != dict.end()) {
exists[i] = i - j;
}
}
}
return exists[s.size() - 1];
}
So is there a faster way to check if a match occurs? Iām trying to use Aho Corasick (from Codechef August Long Contest 2013 - Music & Lyrics), but as long as there is a match for any word, it matches the whole text! Any idea?
This is what I got so far:
#include <iostream>
#include <fstream>
#include <sstream>
#include <iomanip>
#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <functional>
#include <utility>
#include <memory>
#include <cassert>
#include <cctype>
#include <exception>
#include <stdexcept>
#include <string>
#include <cstring>
#include <limits>
#include <climits>
#include <numeric>
#include <cmath>
#include <vector>
#include <list>
#include <stack>
#include <deque>
#include <queue>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <cstring>
const int MAX_STATES = 100;
const int CHARSET = 10;
class aho_corasick {
// deterministics finite automaton
int dfa[MAX_STATES][CHARSET];
int fail[MAX_STATES];
bool accepting[MAX_STATES];
// current state
int ct;
public:
aho_corasick() {
ct = 0;
for (int s = 0; s < MAX_STATES; ++s) {
accepting[s] = false;
for (int c = 0; c < CHARSET; ++c) {
dfa[s][c] = -1;
}
}
}
void build_fail_function() {
// fail to root is root
fail[0] = 0;
for (int a = 0; a < CHARSET; ++a) {
if (dfa[0][a] != -1) {
// make all valid transition from root fall
// back to root
fail[dfa[0][a]] = 0;
} else {
dfa[0][a] = 0;
}
}
}
void build_dfa() {
build_fail_function();
std::queue<int> q;
for (int a = 0; a < CHARSET; ++a) {
if (dfa[0][a] > 0) {
q.push(dfa[0][a]);
}
}
while (!q.empty()) {
int next_state = q.front();
q.pop();
// for all transitions starting from next_state
for (int a = 0; a < CHARSET; ++a) {
if (dfa[next_state][a] > 0) {
fail[dfa[next_state][a]] = dfa[fail[next_state]][a];
q.push(dfa[next_state][a]);
} else {
dfa[next_state][a] = dfa[fail[next_state]][a];
}
}
}
}
int map(char c) const {
return c - '0';
}
int new_state() {
return ct++;
}
bool match(const std::string &text) const {
int s = 0;
int a;
for (int i = 0, sz = text.size(); i < sz; ++i) {
a = map(text[i]);
s = dfa[s][a];
// std::cout << "s = " << s << "\n";
}
return accepting[s];
}
void add_pattern(const std::string &pattern) {
int s = 0;
int a;
for (int i = 0, sz = pattern.size(); i < sz; ++i) {
a = map(pattern[i]);
if (dfa[s][a] == -1) {
dfa[s][a] = ct++;
for (int b = 0; b < CHARSET; ++b) {
dfa[ct][b] = -1;
}
}
// nove to next state
s = dfa[s][a];
std::cout << "s: " << s << "\n";
}
accepting[s] = true;
}
};
void test() {
aho_corasick ac;
ac.add_pattern("12");
ac.add_pattern("122");
ac.build_dfa();
std::cout << ac.match("5100") << "\n";
std::cout << ac.match("1266") << "\n";
}
int main() {
test();
return 0;
}