字符串算法:从KMP到后缀数组的进阶之路

字符串算法的重要性
📈 字符串匹配算法性能对比
算法 | 预处理时间 | 匹配时间 | 空间复杂度 | 最佳场景 |
---|---|---|---|---|
暴力匹配 | 0 | O(nm) | O(1) | 短文本 |
KMP算法 | O(m) | O(n) | O(m) | 重复模式 |
Boyer-Moore | O(m+σ) | O(n/m) | O(σ) | 长文本搜索 |
后缀数组 | O(n log n) | O(m log n) | O(n) | 多次查询 |
字符串处理是信息学竞赛的重要组成部分。从简单的模式匹配到复杂的文本分析,字符串算法无处不在。掌握高效的字符串算法,不仅能解决竞赛中的字符串题目,更是理解计算机科学中许多高级概念的基础。
字符串算法知识体系
算法类别 | 核心算法 | 时间复杂度 | 应用场景 |
---|---|---|---|
模式匹配 | KMP、BM、Sunday | O(n+m) | 查找子串 |
字符串哈希 | Rolling Hash、Rabin-Karp | O(n) | 快速比较 |
字典树 | Trie、AC自动机 | O(n) | 多模式匹配 |
后缀结构 | 后缀数组、后缀树 | O(nlogn) | 子串问题 |
回文算法 | Manacher | O(n) | 回文串 |
KMP算法详解
KMP的核心思想
KMP算法通过预处理模式串,构建部分匹配表(next数组),在失配时利用已匹配的信息避免重复比较。
next数组的构建
vector<int> buildNext(string pattern) {
int m = pattern.length();
vector<int> next(m, 0);
int j = 0; // 前缀的末尾
for (int i = 1; i < m; i++) {
// 失配时回退
while (j > 0 && pattern[i] != pattern[j]) {
j = next[j - 1];
}
// 匹配时前缀长度增加
if (pattern[i] == pattern[j]) {
j++;
}
next[i] = j;
}
return next;
}
KMP匹配过程
vector<int> KMP(string text, string pattern) {
vector<int> next = buildNext(pattern);
vector<int> matches;
int n = text.length();
int m = pattern.length();
int j = 0; // pattern的匹配位置
for (int i = 0; i < n; i++) {
// 失配时根据next数组回退
while (j > 0 && text[i] != pattern[j]) {
j = next[j - 1];
}
// 字符匹配
if (text[i] == pattern[j]) {
j++;
}
// 完全匹配
if (j == m) {
matches.push_back(i - m + 1);
j = next[j - 1]; // 继续寻找下一个匹配
}
}
return matches;
}
KMP的应用与扩展
// 应用1:最短循环节
int shortestPeriod(string s) {
int n = s.length();
vector<int> next = buildNext(s);
int period = n - next[n - 1];
if (n % period == 0) {
return period;
}
return n;
}
// 应用2:统计前缀出现次数
vector<int> countPrefixOccurrences(string s) {
int n = s.length();
vector<int> next = buildNext(s);
vector<int> count(n + 1, 0);
for (int i = 0; i < n; i++) {
count[next[i]]++;
}
// 累加,因为长前缀包含短前缀
for (int i = n - 1; i > 0; i--) {
count[next[i - 1]] += count[i];
}
for (int i = 0; i <= n; i++) {
count[i]++; // 加上自身
}
return count;
}
字符串哈希
Rolling Hash实现
class StringHash {
private:
static const int MOD1 = 1e9 + 7;
static const int MOD2 = 1e9 + 9;
static const int BASE1 = 131;
static const int BASE2 = 137;
vector<long long> hash1, hash2;
vector<long long> pow1, pow2;
public:
StringHash(string s) {
int n = s.length();
hash1.resize(n + 1);
hash2.resize(n + 1);
pow1.resize(n + 1);
pow2.resize(n + 1);
pow1[0] = pow2[0] = 1;
for (int i = 0; i < n; i++) {
hash1[i + 1] = (hash1[i] * BASE1 + s[i]) % MOD1;
hash2[i + 1] = (hash2[i] * BASE2 + s[i]) % MOD2;
pow1[i + 1] = pow1[i] * BASE1 % MOD1;
pow2[i + 1] = pow2[i] * BASE2 % MOD2;
}
}
// 获取子串[l, r)的哈希值
pair<long long, long long> getHash(int l, int r) {
long long h1 = (hash1[r] - hash1[l] * pow1[r - l] % MOD1 + MOD1) % MOD1;
long long h2 = (hash2[r] - hash2[l] * pow2[r - l] % MOD2 + MOD2) % MOD2;
return {h1, h2};
}
// 判断两个子串是否相等
bool equal(int l1, int r1, int l2, int r2) {
return getHash(l1, r1) == getHash(l2, r2);
}
};
字符串哈希的应用
// 应用1:最长公共子串(二分+哈希)
int longestCommonSubstring(string s1, string s2) {
StringHash hash1(s1), hash2(s2);
int n = s1.length(), m = s2.length();
int left = 0, right = min(n, m);
int result = 0;
while (left <= right) {
int mid = (left + right) / 2;
unordered_set<pair<long long, long long>, hash<pair<long long, long long>>> seen;
bool found = false;
// 枚举s1的所有长度为mid的子串
for (int i = 0; i <= n - mid; i++) {
seen.insert(hash1.getHash(i, i + mid));
}
// 检查s2中是否存在
for (int i = 0; i <= m - mid; i++) {
if (seen.count(hash2.getHash(i, i + mid))) {
found = true;
break;
}
}
if (found) {
result = mid;
left = mid + 1;
} else {
right = mid - 1;
}
}
return result;
}
// 应用2:判断字符串循环同构
bool isRotation(string s1, string s2) {
if (s1.length() != s2.length()) return false;
string doubled = s1 + s1;
return KMP(doubled, s2).size() > 0;
}
字典树(Trie)
基础Trie实现
class TrieNode {
public:
TrieNode* children[26];
bool isEnd;
int count; // 经过该节点的单词数
TrieNode() {
memset(children, 0, sizeof(children));
isEnd = false;
count = 0;
}
};
class Trie {
private:
TrieNode* root;
public:
Trie() {
root = new TrieNode();
}
void insert(string word) {
TrieNode* node = root;
for (char c : word) {
int idx = c - a;
if (!node->children[idx]) {
node->children[idx] = new TrieNode();
}
node = node->children[idx];
node->count++;
}
node->isEnd = true;
}
bool search(string word) {
TrieNode* node = root;
for (char c : word) {
int idx = c - a;
if (!node->children[idx]) {
return false;
}
node = node->children[idx];
}
return node->isEnd;
}
bool startsWith(string prefix) {
TrieNode* node = root;
for (char c : prefix) {
int idx = c - a;
if (!node->children[idx]) {
return false;
}
node = node->children[idx];
}
return true;
}
// 统计以prefix为前缀的单词数
int countPrefix(string prefix) {
TrieNode* node = root;
for (char c : prefix) {
int idx = c - a;
if (!node->children[idx]) {
return 0;
}
node = node->children[idx];
}
return node->count;
}
};
01-Trie(处理异或问题)
class BinaryTrie {
struct Node {
Node* children[2];
int count;
Node() : count(0) {
children[0] = children[1] = nullptr;
}
};
Node* root;
static const int MAX_BIT = 30;
public:
BinaryTrie() {
root = new Node();
}
void insert(int num) {
Node* node = root;
for (int i = MAX_BIT; i >= 0; i--) {
int bit = (num >> i) & 1;
if (!node->children[bit]) {
node->children[bit] = new Node();
}
node = node->children[bit];
node->count++;
}
}
int findMaxXor(int num) {
Node* node = root;
int result = 0;
for (int i = MAX_BIT; i >= 0; i--) {
int bit = (num >> i) & 1;
int toggled = 1 - bit;
if (node->children[toggled] && node->children[toggled]->count > 0) {
result |= (1 << i);
node = node->children[toggled];
} else {
node = node->children[bit];
}
}
return result;
}
};
AC自动机
AC自动机是Trie树和KMP算法的结合,用于多模式串匹配。
class ACAutomaton {
struct Node {
Node* children[26];
Node* fail;
vector<int> output; // 在该节点结束的模式串编号
Node() {
memset(children, 0, sizeof(children));
fail = nullptr;
}
};
Node* root;
public:
ACAutomaton() {
root = new Node();
}
void insert(string pattern, int id) {
Node* node = root;
for (char c : pattern) {
int idx = c - a;
if (!node->children[idx]) {
node->children[idx] = new Node();
}
node = node->children[idx];
}
node->output.push_back(id);
}
void buildFailureLinks() {
queue<Node*> q;
// 第一层的失败指针指向根
for (int i = 0; i < 26; i++) {
if (root->children[i]) {
root->children[i]->fail = root;
q.push(root->children[i]);
}
}
while (!q.empty()) {
Node* curr = q.front();
q.pop();
for (int i = 0; i < 26; i++) {
if (curr->children[i]) {
Node* child = curr->children[i];
Node* failNode = curr->fail;
while (failNode && !failNode->children[i]) {
failNode = failNode->fail;
}
child->fail = failNode ? failNode->children[i] : root;
// 合并输出
for (int id : child->fail->output) {
child->output.push_back(id);
}
q.push(child);
}
}
}
}
vector<pair<int, int>> search(string text) {
vector<pair<int, int>> matches;
Node* node = root;
for (int i = 0; i < text.length(); i++) {
int idx = text[i] - a;
while (node && !node->children[idx]) {
node = node->fail;
}
node = node ? node->children[idx] : root;
for (int id : node->output) {
matches.push_back({i, id});
}
}
return matches;
}
};
后缀数组
后缀数组是处理字符串子串问题的强大工具。
class SuffixArray {
vector<int> sa, rank, lcp;
string s;
int n;
void buildSA() {
vector<int> cnt(256, 0);
vector<int> tmpRank(n);
// 计数排序
for (int i = 0; i < n; i++) cnt[s[i]]++;
for (int i = 1; i < 256; i++) cnt[i] += cnt[i - 1];
for (int i = n - 1; i >= 0; i--) sa[--cnt[s[i]]] = i;
rank[sa[0]] = 0;
for (int i = 1; i < n; i++) {
rank[sa[i]] = rank[sa[i - 1]];
if (s[sa[i]] != s[sa[i - 1]]) rank[sa[i]]++;
}
for (int k = 1; k < n; k <<= 1) {
// 按第二关键字排序
int p = 0;
for (int i = n - k; i < n; i++) tmpRank[p++] = i;
for (int i = 0; i < n; i++) {
if (sa[i] >= k) tmpRank[p++] = sa[i] - k;
}
// 按第一关键字排序
cnt.assign(n, 0);
for (int i = 0; i < n; i++) cnt[rank[i]]++;
for (int i = 1; i < n; i++) cnt[i] += cnt[i - 1];
for (int i = n - 1; i >= 0; i--) sa[--cnt[rank[tmpRank[i]]]] = tmpRank[i];
// 更新rank
swap(rank, tmpRank);
rank[sa[0]] = 0;
for (int i = 1; i < n; i++) {
rank[sa[i]] = rank[sa[i - 1]];
if (tmpRank[sa[i]] != tmpRank[sa[i - 1]] ||
tmpRank[sa[i] + k] != tmpRank[sa[i - 1] + k]) {
rank[sa[i]]++;
}
}
if (rank[sa[n - 1]] == n - 1) break;
}
}
void buildLCP() {
int h = 0;
lcp.resize(n);
for (int i = 0; i < n; i++) {
if (rank[i] == 0) continue;
int j = sa[rank[i] - 1];
while (i + h < n && j + h < n && s[i + h] == s[j + h]) h++;
lcp[rank[i]] = h;
if (h > 0) h--;
}
}
public:
SuffixArray(string str) : s(str), n(str.length()) {
sa.resize(n);
rank.resize(n);
buildSA();
buildLCP();
}
// 查找子串出现次数
int countOccurrences(string pattern) {
int m = pattern.length();
int left = 0, right = n - 1;
// 二分查找第一个出现位置
while (left < right) {
int mid = (left + right) / 2;
if (s.substr(sa[mid], m) < pattern) {
left = mid + 1;
} else {
right = mid;
}
}
if (s.substr(sa[left], m) != pattern) return 0;
int first = left;
// 二分查找最后一个出现位置
right = n - 1;
while (left < right) {
int mid = (left + right + 1) / 2;
if (s.substr(sa[mid], m) <= pattern) {
left = mid;
} else {
right = mid - 1;
}
}
return left - first + 1;
}
};
Manacher算法(回文串)
class Manacher {
string preprocess(string s) {
string t = "#";
for (char c : s) {
t += c;
t += "#";
}
return t;
}
public:
vector<int> findPalindromes(string s) {
string t = preprocess(s);
int n = t.length();
vector<int> p(n, 0);
int center = 0, right = 0;
for (int i = 0; i < n; i++) {
int mirror = 2 * center - i;
if (i < right) {
p[i] = min(right - i, p[mirror]);
}
// 尝试扩展
while (i + p[i] + 1 < n && i - p[i] - 1 >= 0 &&
t[i + p[i] + 1] == t[i - p[i] - 1]) {
p[i]++;
}
// 更新中心和右边界
if (i + p[i] > right) {
center = i;
right = i + p[i];
}
}
return p;
}
string longestPalindrome(string s) {
vector<int> p = findPalindromes(s);
int maxLen = 0, centerIdx = 0;
for (int i = 0; i < p.size(); i++) {
if (p[i] > maxLen) {
maxLen = p[i];
centerIdx = i;
}
}
int start = (centerIdx - maxLen) / 2;
return s.substr(start, maxLen);
}
};
字符串算法练习题
分类练习
算法 | 题目 | 难度 |
---|---|---|
KMP | 最短循环节 | ★★☆ |
字符串哈希 | 最长公共子串 | ★★★ |
Trie | 最大异或对 | ★★★ |
AC自动机 | 病毒检测 | ★★★★ |
后缀数组 | 不同子串个数 | ★★★★ |
Manacher | 回文子串计数 | ★★★ |
总结
字符串算法是信息学竞赛的重要组成部分,掌握字符串算法需要:
- 理解原理:深入理解每个算法的核心思想
- 熟练实现:能够快速准确地编写代码
- 灵活应用:识别问题类型,选择合适算法
- 组合使用:多种算法配合解决复杂问题
- 优化技巧:掌握时间和空间的优化方法
字符串算法看似复杂,但都有其内在的优美逻辑。通过系统学习和大量练习,你将能够熟练运用这些强大的工具解决各种字符串问题!
答复