字符串算法:从KMP到后缀数组的进阶之路

glasses, book, education, eyeglasses, research, knowledge, text, textbook, information, literature, study, read, open, old, vintage, antique, retro, page, literary, book, book, book, book, book, education, knowledge, study, study

字符串算法的重要性

📈 字符串匹配算法性能对比

算法 预处理时间 匹配时间 空间复杂度 最佳场景
暴力匹配 0 O(nm) O(1) 短文本
KMP算法 O(m) O(n) O(m) 重复模式
Boyer-Moore O(m+σ) O(n/m) O(σ) 长文本搜索
后缀数组 O(n log n) O(m log n) O(n) 多次查询

字符串处理是信息学竞赛的重要组成部分。从简单的模式匹配到复杂的文本分析,字符串算法无处不在。掌握高效的字符串算法,不仅能解决竞赛中的字符串题目,更是理解计算机科学中许多高级概念的基础。

字符串算法知识体系

算法类别 核心算法 时间复杂度 应用场景
模式匹配 KMP、BM、Sunday O(n+m) 查找子串
字符串哈希 Rolling Hash、Rabin-Karp O(n) 快速比较
字典树 Trie、AC自动机 O(n) 多模式匹配
后缀结构 后缀数组、后缀树 O(nlogn) 子串问题
回文算法 Manacher O(n) 回文串

KMP算法详解

KMP的核心思想

KMP算法通过预处理模式串,构建部分匹配表(next数组),在失配时利用已匹配的信息避免重复比较。

next数组的构建


vector<int> buildNext(string pattern) {
    int m = pattern.length();
    vector<int> next(m, 0);
    
    int j = 0;  // 前缀的末尾
    for (int i = 1; i < m; i++) {
        // 失配时回退
        while (j > 0 && pattern[i] != pattern[j]) {
            j = next[j - 1];
        }
        
        // 匹配时前缀长度增加
        if (pattern[i] == pattern[j]) {
            j++;
        }
        
        next[i] = j;
    }
    
    return next;
}

KMP匹配过程


vector<int> KMP(string text, string pattern) {
    vector<int> next = buildNext(pattern);
    vector<int> matches;
    
    int n = text.length();
    int m = pattern.length();
    int j = 0;  // pattern的匹配位置
    
    for (int i = 0; i < n; i++) {
        // 失配时根据next数组回退
        while (j > 0 && text[i] != pattern[j]) {
            j = next[j - 1];
        }
        
        // 字符匹配
        if (text[i] == pattern[j]) {
            j++;
        }
        
        // 完全匹配
        if (j == m) {
            matches.push_back(i - m + 1);
            j = next[j - 1];  // 继续寻找下一个匹配
        }
    }
    
    return matches;
}

KMP的应用与扩展


// 应用1:最短循环节
int shortestPeriod(string s) {
    int n = s.length();
    vector<int> next = buildNext(s);
    
    int period = n - next[n - 1];
    if (n % period == 0) {
        return period;
    }
    return n;
}

// 应用2:统计前缀出现次数
vector<int> countPrefixOccurrences(string s) {
    int n = s.length();
    vector<int> next = buildNext(s);
    vector<int> count(n + 1, 0);
    
    for (int i = 0; i < n; i++) {
        count[next[i]]++;
    }
    
    // 累加,因为长前缀包含短前缀
    for (int i = n - 1; i > 0; i--) {
        count[next[i - 1]] += count[i];
    }
    
    for (int i = 0; i <= n; i++) {
        count[i]++;  // 加上自身
    }
    
    return count;
}

字符串哈希

Rolling Hash实现


class StringHash {
private:
    static const int MOD1 = 1e9 + 7;
    static const int MOD2 = 1e9 + 9;
    static const int BASE1 = 131;
    static const int BASE2 = 137;
    
    vector<long long> hash1, hash2;
    vector<long long> pow1, pow2;
    
public:
    StringHash(string s) {
        int n = s.length();
        hash1.resize(n + 1);
        hash2.resize(n + 1);
        pow1.resize(n + 1);
        pow2.resize(n + 1);
        
        pow1[0] = pow2[0] = 1;
        
        for (int i = 0; i < n; i++) {
            hash1[i + 1] = (hash1[i] * BASE1 + s[i]) % MOD1;
            hash2[i + 1] = (hash2[i] * BASE2 + s[i]) % MOD2;
            pow1[i + 1] = pow1[i] * BASE1 % MOD1;
            pow2[i + 1] = pow2[i] * BASE2 % MOD2;
        }
    }
    
    // 获取子串[l, r)的哈希值
    pair<long long, long long> getHash(int l, int r) {
        long long h1 = (hash1[r] - hash1[l] * pow1[r - l] % MOD1 + MOD1) % MOD1;
        long long h2 = (hash2[r] - hash2[l] * pow2[r - l] % MOD2 + MOD2) % MOD2;
        return {h1, h2};
    }
    
    // 判断两个子串是否相等
    bool equal(int l1, int r1, int l2, int r2) {
        return getHash(l1, r1) == getHash(l2, r2);
    }
};

字符串哈希的应用


// 应用1:最长公共子串(二分+哈希)
int longestCommonSubstring(string s1, string s2) {
    StringHash hash1(s1), hash2(s2);
    int n = s1.length(), m = s2.length();
    
    int left = 0, right = min(n, m);
    int result = 0;
    
    while (left <= right) {
        int mid = (left + right) / 2;
        
        unordered_set<pair<long long, long long>, hash<pair<long long, long long>>> seen;
        bool found = false;
        
        // 枚举s1的所有长度为mid的子串
        for (int i = 0; i <= n - mid; i++) {
            seen.insert(hash1.getHash(i, i + mid));
        }
        
        // 检查s2中是否存在
        for (int i = 0; i <= m - mid; i++) {
            if (seen.count(hash2.getHash(i, i + mid))) {
                found = true;
                break;
            }
        }
        
        if (found) {
            result = mid;
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }
    
    return result;
}

// 应用2:判断字符串循环同构
bool isRotation(string s1, string s2) {
    if (s1.length() != s2.length()) return false;
    
    string doubled = s1 + s1;
    return KMP(doubled, s2).size() > 0;
}

字典树(Trie)

基础Trie实现


class TrieNode {
public:
    TrieNode* children[26];
    bool isEnd;
    int count;  // 经过该节点的单词数
    
    TrieNode() {
        memset(children, 0, sizeof(children));
        isEnd = false;
        count = 0;
    }
};

class Trie {
private:
    TrieNode* root;
    
public:
    Trie() {
        root = new TrieNode();
    }
    
    void insert(string word) {
        TrieNode* node = root;
        for (char c : word) {
            int idx = c - a;
            if (!node->children[idx]) {
                node->children[idx] = new TrieNode();
            }
            node = node->children[idx];
            node->count++;
        }
        node->isEnd = true;
    }
    
    bool search(string word) {
        TrieNode* node = root;
        for (char c : word) {
            int idx = c - a;
            if (!node->children[idx]) {
                return false;
            }
            node = node->children[idx];
        }
        return node->isEnd;
    }
    
    bool startsWith(string prefix) {
        TrieNode* node = root;
        for (char c : prefix) {
            int idx = c - a;
            if (!node->children[idx]) {
                return false;
            }
            node = node->children[idx];
        }
        return true;
    }
    
    // 统计以prefix为前缀的单词数
    int countPrefix(string prefix) {
        TrieNode* node = root;
        for (char c : prefix) {
            int idx = c - a;
            if (!node->children[idx]) {
                return 0;
            }
            node = node->children[idx];
        }
        return node->count;
    }
};

01-Trie(处理异或问题)


class BinaryTrie {
    struct Node {
        Node* children[2];
        int count;
        Node() : count(0) {
            children[0] = children[1] = nullptr;
        }
    };
    
    Node* root;
    static const int MAX_BIT = 30;
    
public:
    BinaryTrie() {
        root = new Node();
    }
    
    void insert(int num) {
        Node* node = root;
        for (int i = MAX_BIT; i >= 0; i--) {
            int bit = (num >> i) & 1;
            if (!node->children[bit]) {
                node->children[bit] = new Node();
            }
            node = node->children[bit];
            node->count++;
        }
    }
    
    int findMaxXor(int num) {
        Node* node = root;
        int result = 0;
        
        for (int i = MAX_BIT; i >= 0; i--) {
            int bit = (num >> i) & 1;
            int toggled = 1 - bit;
            
            if (node->children[toggled] && node->children[toggled]->count > 0) {
                result |= (1 << i);
                node = node->children[toggled];
            } else {
                node = node->children[bit];
            }
        }
        
        return result;
    }
};

AC自动机

AC自动机是Trie树和KMP算法的结合,用于多模式串匹配。


class ACAutomaton {
    struct Node {
        Node* children[26];
        Node* fail;
        vector<int> output;  // 在该节点结束的模式串编号
        
        Node() {
            memset(children, 0, sizeof(children));
            fail = nullptr;
        }
    };
    
    Node* root;
    
public:
    ACAutomaton() {
        root = new Node();
    }
    
    void insert(string pattern, int id) {
        Node* node = root;
        for (char c : pattern) {
            int idx = c - a;
            if (!node->children[idx]) {
                node->children[idx] = new Node();
            }
            node = node->children[idx];
        }
        node->output.push_back(id);
    }
    
    void buildFailureLinks() {
        queue<Node*> q;
        
        // 第一层的失败指针指向根
        for (int i = 0; i < 26; i++) {
            if (root->children[i]) {
                root->children[i]->fail = root;
                q.push(root->children[i]);
            }
        }
        
        while (!q.empty()) {
            Node* curr = q.front();
            q.pop();
            
            for (int i = 0; i < 26; i++) {
                if (curr->children[i]) {
                    Node* child = curr->children[i];
                    Node* failNode = curr->fail;
                    
                    while (failNode && !failNode->children[i]) {
                        failNode = failNode->fail;
                    }
                    
                    child->fail = failNode ? failNode->children[i] : root;
                    
                    // 合并输出
                    for (int id : child->fail->output) {
                        child->output.push_back(id);
                    }
                    
                    q.push(child);
                }
            }
        }
    }
    
    vector<pair<int, int>> search(string text) {
        vector<pair<int, int>> matches;
        Node* node = root;
        
        for (int i = 0; i < text.length(); i++) {
            int idx = text[i] - a;
            
            while (node && !node->children[idx]) {
                node = node->fail;
            }
            
            node = node ? node->children[idx] : root;
            
            for (int id : node->output) {
                matches.push_back({i, id});
            }
        }
        
        return matches;
    }
};

后缀数组

后缀数组是处理字符串子串问题的强大工具。


class SuffixArray {
    vector<int> sa, rank, lcp;
    string s;
    int n;
    
    void buildSA() {
        vector<int> cnt(256, 0);
        vector<int> tmpRank(n);
        
        // 计数排序
        for (int i = 0; i < n; i++) cnt[s[i]]++;
        for (int i = 1; i < 256; i++) cnt[i] += cnt[i - 1];
        for (int i = n - 1; i >= 0; i--) sa[--cnt[s[i]]] = i;
        
        rank[sa[0]] = 0;
        for (int i = 1; i < n; i++) {
            rank[sa[i]] = rank[sa[i - 1]];
            if (s[sa[i]] != s[sa[i - 1]]) rank[sa[i]]++;
        }
        
        for (int k = 1; k < n; k <<= 1) {
            // 按第二关键字排序
            int p = 0;
            for (int i = n - k; i < n; i++) tmpRank[p++] = i;
            for (int i = 0; i < n; i++) {
                if (sa[i] >= k) tmpRank[p++] = sa[i] - k;
            }
            
            // 按第一关键字排序
            cnt.assign(n, 0);
            for (int i = 0; i < n; i++) cnt[rank[i]]++;
            for (int i = 1; i < n; i++) cnt[i] += cnt[i - 1];
            for (int i = n - 1; i >= 0; i--) sa[--cnt[rank[tmpRank[i]]]] = tmpRank[i];
            
            // 更新rank
            swap(rank, tmpRank);
            rank[sa[0]] = 0;
            for (int i = 1; i < n; i++) {
                rank[sa[i]] = rank[sa[i - 1]];
                if (tmpRank[sa[i]] != tmpRank[sa[i - 1]] ||
                    tmpRank[sa[i] + k] != tmpRank[sa[i - 1] + k]) {
                    rank[sa[i]]++;
                }
            }
            
            if (rank[sa[n - 1]] == n - 1) break;
        }
    }
    
    void buildLCP() {
        int h = 0;
        lcp.resize(n);
        
        for (int i = 0; i < n; i++) {
            if (rank[i] == 0) continue;
            
            int j = sa[rank[i] - 1];
            while (i + h < n && j + h < n && s[i + h] == s[j + h]) h++;
            
            lcp[rank[i]] = h;
            if (h > 0) h--;
        }
    }
    
public:
    SuffixArray(string str) : s(str), n(str.length()) {
        sa.resize(n);
        rank.resize(n);
        buildSA();
        buildLCP();
    }
    
    // 查找子串出现次数
    int countOccurrences(string pattern) {
        int m = pattern.length();
        int left = 0, right = n - 1;
        
        // 二分查找第一个出现位置
        while (left < right) {
            int mid = (left + right) / 2;
            if (s.substr(sa[mid], m) < pattern) {
                left = mid + 1;
            } else {
                right = mid;
            }
        }
        
        if (s.substr(sa[left], m) != pattern) return 0;
        
        int first = left;
        
        // 二分查找最后一个出现位置
        right = n - 1;
        while (left < right) {
            int mid = (left + right + 1) / 2;
            if (s.substr(sa[mid], m) <= pattern) {
                left = mid;
            } else {
                right = mid - 1;
            }
        }
        
        return left - first + 1;
    }
};

Manacher算法(回文串)


class Manacher {
    string preprocess(string s) {
        string t = "#";
        for (char c : s) {
            t += c;
            t += "#";
        }
        return t;
    }
    
public:
    vector<int> findPalindromes(string s) {
        string t = preprocess(s);
        int n = t.length();
        vector<int> p(n, 0);
        
        int center = 0, right = 0;
        
        for (int i = 0; i < n; i++) {
            int mirror = 2 * center - i;
            
            if (i < right) {
                p[i] = min(right - i, p[mirror]);
            }
            
            // 尝试扩展
            while (i + p[i] + 1 < n && i - p[i] - 1 >= 0 &&
                   t[i + p[i] + 1] == t[i - p[i] - 1]) {
                p[i]++;
            }
            
            // 更新中心和右边界
            if (i + p[i] > right) {
                center = i;
                right = i + p[i];
            }
        }
        
        return p;
    }
    
    string longestPalindrome(string s) {
        vector<int> p = findPalindromes(s);
        int maxLen = 0, centerIdx = 0;
        
        for (int i = 0; i < p.size(); i++) {
            if (p[i] > maxLen) {
                maxLen = p[i];
                centerIdx = i;
            }
        }
        
        int start = (centerIdx - maxLen) / 2;
        return s.substr(start, maxLen);
    }
};

字符串算法练习题

分类练习

算法 题目 难度
KMP 最短循环节 ★★☆
字符串哈希 最长公共子串 ★★★
Trie 最大异或对 ★★★
AC自动机 病毒检测 ★★★★
后缀数组 不同子串个数 ★★★★
Manacher 回文子串计数 ★★★

总结

字符串算法是信息学竞赛的重要组成部分,掌握字符串算法需要:

  1. 理解原理:深入理解每个算法的核心思想
  2. 熟练实现:能够快速准确地编写代码
  3. 灵活应用:识别问题类型,选择合适算法
  4. 组合使用:多种算法配合解决复杂问题
  5. 优化技巧:掌握时间和空间的优化方法

字符串算法看似复杂,但都有其内在的优美逻辑。通过系统学习和大量练习,你将能够熟练运用这些强大的工具解决各种字符串问题!

Views: 0

相关文章

答复

您的邮箱地址不会被公开。 必填项已用 * 标注