Improve issue & code search (#33860)
Some checks are pending
release-nightly / nightly-binary (push) Waiting to run
release-nightly / nightly-docker-rootful (push) Waiting to run
release-nightly / nightly-docker-rootless (push) Waiting to run

Each "indexer" should provide the "search modes" they support by
themselves. And we need to remove the "fuzzy" search for code.
This commit is contained in:
wxiaoguang
2025-03-13 11:07:48 +08:00
committed by GitHub
parent cd10456664
commit 403775e74e
31 changed files with 317 additions and 172 deletions

View File

@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
"code.gitea.io/gitea/modules/indexer"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
@ -136,6 +137,10 @@ type Indexer struct {
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
@ -267,19 +272,18 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)
keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
if isPhrase {
q := bleve.NewMatchPhraseQuery(keywordAsPhrase)
if opts.SearchMode == indexer.SearchModeExact {
q := bleve.NewMatchPhraseQuery(opts.Keyword)
q.FieldVal = "Content"
if opts.IsKeywordFuzzy {
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase)
}
contentQuery = q
} else {
} else /* words */ {
q := bleve.NewMatchQuery(opts.Keyword)
q.FieldVal = "Content"
if opts.IsKeywordFuzzy {
if opts.SearchMode == indexer.SearchModeFuzzy {
// this logic doesn't seem right, it is only used to pass the test-case `Keyword: "dESCRIPTION"`, which doesn't seem to be a real-life use-case.
q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
} else {
q.Operator = query.MatchQueryOperatorAnd
}
contentQuery = q
}

View File

@ -16,6 +16,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_elasticsearch "code.gitea.io/gitea/modules/indexer/internal/elasticsearch"
@ -24,7 +25,6 @@ import (
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
"code.gitea.io/gitea/modules/util"
"github.com/go-enry/go-enry/v2"
"github.com/olivere/elastic/v7"
@ -46,6 +46,10 @@ type Indexer struct {
indexer_internal.Indexer // do not composite inner_elasticsearch.Indexer directly to avoid exposing too much
}
func (b *Indexer) SupportedSearchModes() []indexer.SearchMode {
return indexer.SearchModesExactWords()
}
// NewIndexer creates a new elasticsearch indexer
func NewIndexer(url, indexerName string) *Indexer {
inner := inner_elasticsearch.NewIndexer(url, indexerName, esRepoIndexerLatestVersion, defaultMapping)
@ -361,15 +365,10 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
// Search searches for codes and language stats by given conditions.
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var contentQuery elastic.Query
keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
if isPhrase {
contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase)
} else {
// TODO: this is the old logic, but not really using "fuzziness"
// * IsKeywordFuzzy=true: "best_fields"
// * IsKeywordFuzzy=false: "phrase_prefix"
contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).
Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix))
if opts.SearchMode == indexer.SearchModeExact {
contentQuery = elastic.NewMatchPhraseQuery("content", opts.Keyword)
} else /* words */ {
contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword).Type(esMultiMatchTypeBestFields).Operator("and")
}
kwQuery := elastic.NewBoolQuery().Should(
contentQuery,

View File

@ -9,6 +9,7 @@ import (
"strings"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/indexer"
code_indexer "code.gitea.io/gitea/modules/indexer/code"
"code.gitea.io/gitea/modules/setting"
)
@ -23,11 +24,16 @@ func indexSettingToGitGrepPathspecList() (list []string) {
return list
}
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) {
// TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior
func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, searchMode indexer.SearchModeType) (searchResults []*code_indexer.Result, total int, err error) {
grepMode := git.GrepModeWords
if searchMode == indexer.SearchModeExact {
grepMode = git.GrepModeExact
} else if searchMode == indexer.SearchModeRegexp {
grepMode = git.GrepModeRegexp
}
res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
ContextLineNumber: 1,
IsFuzzy: isFuzzy,
GrepMode: grepMode,
RefName: ref.String(),
PathspecList: indexSettingToGitGrepPathspecList(),
})

View File

@ -14,6 +14,7 @@ import (
"code.gitea.io/gitea/models/db"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
@ -302,3 +303,11 @@ func populateRepoIndexer(ctx context.Context) {
}
log.Info("Done (re)populating the repo indexer with existing repositories")
}
func SupportedSearchModes() []indexer.SearchMode {
gi := globalIndexer.Load()
if gi == nil {
return nil
}
return (*gi).SupportedSearchModes()
}

View File

@ -11,6 +11,7 @@ import (
"code.gitea.io/gitea/models/db"
"code.gitea.io/gitea/models/unittest"
indexer_module "code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/code/bleve"
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
"code.gitea.io/gitea/modules/indexer/code/internal"
@ -39,10 +40,11 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
assert.NoError(t, setupRepositoryIndexes(t.Context(), indexer))
keywords := []struct {
RepoIDs []int64
Keyword string
Langs int
Results []codeSearchResult
RepoIDs []int64
Keyword string
Langs int
SearchMode indexer_module.SearchModeType
Results []codeSearchResult
}{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
@ -183,9 +185,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
// Search for matches on the contents of files regardless of case.
{
RepoIDs: nil,
Keyword: "dESCRIPTION",
Langs: 1,
RepoIDs: nil,
Keyword: "dESCRIPTION",
Langs: 1,
SearchMode: indexer_module.SearchModeFuzzy,
Results: []codeSearchResult{
{
Filename: "README.md",
@ -193,7 +196,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
// Search for an exact match on the filename within the repo '62' (case insenstive).
// Search for an exact match on the filename within the repo '62' (case-insensitive).
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
@ -206,7 +209,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
// Search for matches on the contents of files when the criteria is a expression.
// Search for matches on the contents of files when the criteria are an expression.
{
RepoIDs: []int64{62},
Keyword: "console.log",
@ -218,7 +221,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
// Search for matches on the contents of files when the criteria is part of a expression.
// Search for matches on the contents of files when the criteria are parts of an expression.
{
RepoIDs: []int64{62},
Keyword: "log",
@ -235,16 +238,16 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
for _, kw := range keywords {
t.Run(kw.Keyword, func(t *testing.T) {
total, res, langs, err := indexer.Search(t.Context(), &internal.SearchOptions{
RepoIDs: kw.RepoIDs,
Keyword: kw.Keyword,
RepoIDs: kw.RepoIDs,
Keyword: kw.Keyword,
SearchMode: kw.SearchMode,
Paginator: &db.ListOptions{
Page: 1,
PageSize: 10,
},
IsKeywordFuzzy: true,
})
assert.NoError(t, err)
assert.Len(t, langs, kw.Langs)
require.NoError(t, err)
require.Len(t, langs, kw.Langs)
hits := make([]codeSearchResult, 0, len(res))
@ -289,7 +292,7 @@ func TestBleveIndexAndSearch(t *testing.T) {
_, err := idx.Init(t.Context())
require.NoError(t, err)
testIndexer("beleve", t, idx)
testIndexer("bleve", t, idx)
}
func TestESIndexAndSearch(t *testing.T) {

View File

@ -9,6 +9,7 @@ import (
"code.gitea.io/gitea/models/db"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/indexer"
"code.gitea.io/gitea/modules/indexer/internal"
)
@ -18,6 +19,7 @@ type Indexer interface {
Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error
Delete(ctx context.Context, repoID int64) error
Search(ctx context.Context, opts *SearchOptions) (int64, []*SearchResult, []*SearchResultLanguages, error)
SupportedSearchModes() []indexer.SearchMode
}
type SearchOptions struct {
@ -25,7 +27,7 @@ type SearchOptions struct {
Keyword string
Language string
IsKeywordFuzzy bool
SearchMode indexer.SearchModeType
db.Paginator
}
@ -41,6 +43,10 @@ type dummyIndexer struct {
internal.Indexer
}
func (d *dummyIndexer) SupportedSearchModes() []indexer.SearchMode {
return nil
}
func (d *dummyIndexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *RepoChanges) error {
return fmt.Errorf("indexer is not ready")
}

View File

@ -10,9 +10,7 @@ import (
"code.gitea.io/gitea/modules/log"
)
const (
filenameMatchNumberOfLines = 7 // Copied from github search
)
const filenameMatchNumberOfLines = 7 // Copied from GitHub search
func FilenameIndexerID(repoID int64, filename string) string {
return internal.Base36(repoID) + "_" + filename
@ -48,11 +46,3 @@ func FilenameMatchIndexPos(content string) (int, int) {
}
return 0, len(content)
}
func ParseKeywordAsPhrase(keyword string) (string, bool) {
if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 {
// only remove the prefix and suffix quotes, no need to decode the content at the moment
return keyword[1 : len(keyword)-1], true
}
return "", false
}

View File

@ -1,30 +0,0 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package internal
import (
"testing"
"github.com/stretchr/testify/assert"
)
func TestParseKeywordAsPhrase(t *testing.T) {
cases := []struct {
keyword string
phrase string
isPhrase bool
}{
{``, "", false},
{`a`, "", false},
{`"`, "", false},
{`"a`, "", false},
{`"a"`, "a", true},
{`""\"""`, `"\""`, true},
}
for _, c := range cases {
phrase, isPhrase := ParseKeywordAsPhrase(c.keyword)
assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword)
assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword)
}
}

View File

@ -129,7 +129,6 @@ func searchResult(result *internal.SearchResult, startIndex, endIndex int) (*Res
}
// PerformSearch perform a search on a repository
// if isFuzzy is true set the Damerau-Levenshtein distance from 0 to 2
func PerformSearch(ctx context.Context, opts *SearchOptions) (int, []*Result, []*SearchResultLanguages, error) {
if opts == nil || len(opts.Keyword) == 0 {
return 0, nil, nil, nil