Keyboard shortcuts

Press ← or → to navigate between chapters

Press S or / to search in the book

Press ? to show this help

Press Esc to hide this help

Analyzers & Tokenizers với .NET

Analyzer là gì?

Analyzer quyết định cách Elasticsearch phân tích text khi index và khi search.

"Apple iPhone 15 Pro Max"
         ↓  Analyzer
  Tokenizer (tách thành tokens)
         ↓
  Token Filters (lowercase, stop words, stemming)
         ↓
  Inverted Index: ["apple", "iphone", "15", "pro", "max"]
Anatomy of an Analyzer:
┌─────────────────────────────────────────────────────────┐
│                      Analyzer                           │
│  ┌──────────────┐  ┌────────────┐  ┌────────────────┐  │
│  │ Char Filters │→ │ Tokenizer  │→ │ Token Filters  │  │
│  │ (pre-process)│  │ (split)    │  │ (transform)    │  │
│  └──────────────┘  └────────────┘  └────────────────┘  │
└─────────────────────────────────────────────────────────┘

Built-in Analyzers

// Testing an analyzer from .NET via the _analyze API.
public async Task TestAnalyzerAsync()
{
    // Run the built-in "english" analyzer against a sample sentence.
    var response = await _es.Indices.AnalyzeAsync(a => a
        .Analyzer("english")
        .Text("The quick brown foxes are running")
    );

    // Expected tokens: ["quick", "brown", "fox", "run"] - stemming + stop words
    var tokens = response.Tokens ?? [];
    foreach (var token in tokens)
    {
        Console.WriteLine($"{token.Token} (pos: {token.Position})");
    }
}
| Analyzer   | Mô tả                                   | Ví dụ output                       |
|------------|-----------------------------------------|------------------------------------|
| standard   | Mặc định - tách theo Unicode, lowercase | "Hello World" → ["hello", "world"] |
| simple     | Tách theo ký tự không phải chữ          | "IP 192.168.1.1" → ["ip"]          |
| whitespace | Tách theo khoảng trắng                  | "foo bar" → ["foo", "bar"]         |
| english    | Stemming tiếng Anh + stop words         | "running foxes" → ["run", "fox"]   |
| keyword    | Không tách, giữ nguyên                  | "Hello World" → ["Hello World"]    |

Custom Analyzer trong .NET

/// <summary>
/// Creates the "products" index with a custom analysis pipeline:
/// char filters → tokenizers → token filters → analyzers, plus mappings
/// that use different analyzers at index time and search time.
/// </summary>
public async Task CreateIndexWithCustomAnalyzerAsync()
{
    await _es.Indices.CreateAsync<Product>("products", c => c
        .Settings(s => s
            .Analysis(a => a
                // 1. Char filters: run before the tokenizer on the raw text.
                .CharFilters(cf => cf
                    .Mapping("remove_special_chars", m => m
                        .Mappings(new[] { "& => and", "@ => at" })
                    )
                )
                // 2. Tokenizers
                .Tokenizers(t => t
                    .EdgeNGram("edge_ngram_tokenizer", en => en
                        .MinGram(2)
                        .MaxGram(10)
                        .TokenChars(new[] { TokenChar.Letter, TokenChar.Digit })
                    )
                )
                // 3. Token filters.
                // BUG FIX: the original called .TokenFilters(...) twice
                // (once here and once after .Analyzers). The fluent descriptor
                // assigns the token-filter collection, so the second call
                // replaced the first — the filters did not coexist. All four
                // filters must be registered in a single call.
                .TokenFilters(tf => tf
                    .Synonym("my_synonyms", syn => syn
                        .Synonyms(new[]
                        {
                            "phone, smartphone, mobile",
                            "laptop, notebook, computer"
                        })
                    )
                    .Stop("my_stop", st => st
                        .StopWords(new[] { "a", "an", "the", "is", "are" })
                    )
                    .Stemmer("english_stemmer", st => st
                        .Language("english")
                    )
                    .EdgeNGram("autocomplete_filter", en => en
                        .MinGram(1)
                        .MaxGram(20)
                    )
                )
                // 4. Custom analyzers composed from the pieces above.
                .Analyzers(an => an
                    // Search-time analyzer (full pipeline).
                    .Custom("product_search_analyzer", ca => ca
                        .CharFilter(new[] { "remove_special_chars" })
                        .Tokenizer("standard")
                        .Filter(new[] { "lowercase", "my_stop", "my_synonyms", "english_stemmer" })
                    )
                    // Index-time analyzer (edge n-gram for autocomplete).
                    .Custom("product_index_analyzer", ca => ca
                        .Tokenizer("edge_ngram_tokenizer")
                        .Filter(new[] { "lowercase" })
                    )
                    // Simple autocomplete analyzer (n-gram via token filter).
                    .Custom("autocomplete_analyzer", ca => ca
                        .Tokenizer("standard")
                        .Filter(new[] { "lowercase", "autocomplete_filter" })
                    )
                )
            )
        )
        .Mappings(m => m
            .Properties(p => p
                .Text(t => t
                    .Name(n => n.Name)
                    .Analyzer("product_index_analyzer")        // index time
                    .SearchAnalyzer("product_search_analyzer") // search time (different!)
                    .Fields(f => f
                        .Keyword(k => k.Name("keyword"))
                        .Text(txt => txt
                            .Name("autocomplete")
                            .Analyzer("autocomplete_analyzer")
                            .SearchAnalyzer("standard")
                        )
                    )
                )
            )
        )
    );
}

Autocomplete với .NET

Pattern 1: Edge N-Gram Tokenizer

// When the user types "ipho" → match "iphone".
// Edge N-Gram indexes "iphone" as: "i", "ip", "iph", "ipho", "iphon", "iphone".

public async Task<List<string>> AutocompleteAsync(string prefix)
{
    // Query the sub-field indexed with the autocomplete analyzer; only
    // fetch the Name field from _source to keep responses small.
    var response = await _es.SearchAsync<Product>(s => s
        .Source(src => src.Includes(i => i.Fields(f => f.Name)))
        .Query(q => q
            .Match(m => m
                .Field("name.autocomplete") // Dùng field với autocomplete analyzer
                .Query(prefix)
            )
        )
        .Size(10)
    );

    // Collect non-empty names, de-duplicated in first-seen order.
    var suggestions = new List<string>();
    foreach (var hit in response.Hits)
    {
        var name = hit.Source?.Name;
        if (!string.IsNullOrEmpty(name) && !suggestions.Contains(name))
        {
            suggestions.Add(name);
        }
    }
    return suggestions;
}

Pattern 2: Search-as-you-type Field

// search_as_you_type: a special field type built for autocomplete.
// It auto-generates the sub-fields queried below: name._2gram,
// name._3gram and name._index_prefix.
.Mappings(m => m
    .Properties(p => p
        .SearchAsYouType(s => s
            .Name(n => n.Name)
            .MaxShingleSize(4) // shingle sub-fields up to 4 terms
        )
    )
)

// Query: multi_match of type bool_prefix across the generated sub-fields
// (`prefix` must be in scope at the call site).
var response = await _es.SearchAsync<Product>(s => s
    .Query(q => q
        .MultiMatch(mm => mm
            .Fields(new[]
            {
                "name",
                "name._2gram",
                "name._3gram",
                "name._index_prefix"
            })
            .Query(prefix)
            .Type(TextQueryType.BoolPrefix) // last term is matched as a prefix
        )
    )
    .Size(10)
);

Test Analyzer từ .NET

/// <summary>
/// Returns the tokens an analyzer produces for <paramref name="text"/>.
/// When <paramref name="indexName"/> is given, the analyzer is resolved
/// against that index, so index-local custom analyzers can be tested too.
/// </summary>
public async Task<List<string>> GetTokensAsync(
    string text,
    string analyzer = "standard",
    string? indexName = null)
{
    var response = indexName is null
        ? await _es.Indices.AnalyzeAsync(a => a
            .Analyzer(analyzer)
            .Text(text)
          )
        : await _es.Indices.AnalyzeAsync(a => a
            .Index(indexName)
            .Analyzer(analyzer)
            .Text(text)
          );

    if (response.Tokens is null)
    {
        return [];
    }

    return response.Tokens
        .Select(t => t.Token)
        .ToList();
}

// Usage: debug how a given analyzer tokenizes a piece of text.
var tokens = await GetTokensAsync(
    "The iPhones are amazing smartphones!",
    "english"
);
// Output: ["iphon", "amaz", "smartphon"]  (stemmed + stop words removed)

Language Analyzers

// For Vietnamese content - use ICU analysis or a custom analyzer.
// Requires plugin: elasticsearch-analysis-icu (provides icu_tokenizer
// and icu_normalizer used below).

await _es.Indices.CreateAsync("products-vi", c => c
    .Settings(s => s
        .Analysis(a => a
            .Analyzers(an => an
                .Custom("vi_analyzer", ca => ca
                    .Tokenizer("icu_tokenizer")    // Unicode-aware tokenizer
                    .Filter(new[] { "icu_normalizer", "lowercase" })
                )
            )
        )
    )
    .Mappings(m => m
        .Properties(p => p
            .Text(t => t.Name("name").Analyzer("vi_analyzer"))
        )
    )
);