Exact search (#2135)

* feat: add exact search for quoted phrases/words

* feat: get some highlighting for exact search

* feat: add exact search for title and text fields in OpenSearch

* simplify and make it work with the NLP script

---------

Co-authored-by: Keyan <34140557+huumn@users.noreply.github.com>
Co-authored-by: k00b <k00b@stacker.news>
This commit is contained in:
m0wer 2025-05-15 16:11:58 +02:00 committed by GitHub
parent d7ddfffbf0
commit f12c03198d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 71 additions and 20 deletions

View File

@ -283,25 +283,23 @@ export default {
// if quoted phrases, items must contain entire phrase // if quoted phrases, items must contain entire phrase
for (const quote of quotes) { for (const quote of quotes) {
termQueries.push({
multi_match: {
query: quote,
type: 'phrase',
fields: ['title', 'text']
}
})
// force the search to include the quoted phrase
filters.push({ filters.push({
multi_match: { multi_match: {
query: quote, query: quote,
fields: ['title.exact', 'text.exact'],
type: 'phrase'
}
})
termQueries.push({
multi_match: {
query: quote,
fields: ['title.exact^10', 'text.exact'],
type: 'phrase', type: 'phrase',
fields: ['title', 'text'] boost: 1000
} }
}) })
} }
// functions for boosting search rank by recency or popularity
switch (sort) { switch (sort) {
case 'comments': case 'comments':
functions.push({ functions.push({
@ -389,6 +387,24 @@ export default {
fields: ['title^10', 'text'], fields: ['title^10', 'text'],
boost: 1000 boost: 1000
} }
},
// match on exact fields higher
{
multi_match: {
query,
type: 'best_fields',
fields: ['title.exact^10', 'text.exact'],
boost: 100
}
},
// exact phrase matches higher
{
multi_match: {
query,
fields: ['title.exact^10', 'text.exact'],
type: 'phrase',
boost: 10000
}
} }
] ]
@ -452,7 +468,9 @@ export default {
highlight: { highlight: {
fields: { fields: {
title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] }, title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] } 'title.exact': { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] },
'text.exact': { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] }
} }
} }
} }
@ -487,8 +505,14 @@ export default {
orderBy: 'ORDER BY rank ASC, msats DESC' orderBy: 'ORDER BY rank ASC, msats DESC'
})).map((item, i) => { })).map((item, i) => {
const e = sitems.body.hits.hits[i] const e = sitems.body.hits.hits[i]
item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title
item.searchText = (e.highlight?.text && e.highlight.text.join(' ... ')) || undefined // prefer the fuzzier highlight for title
item.searchTitle = e.highlight?.title?.[0] || e.highlight?.['title.exact']?.[0] || item.title
// prefer the exact highlight for text
const searchTextHighlight = [...(e.highlight?.['text.exact'] || []), ...(e.highlight?.text || [])]
item.searchText = searchTextHighlight?.slice(0, 5)?.join(' ... ')
return item return item
}) })

View File

@ -27,13 +27,28 @@ else
"text": { "text": {
"type": "text", "type": "text",
"analyzer": "english", "analyzer": "english",
"fields": {"keyword":{"type":"keyword","ignore_above":256}} "fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
}, },
"title": { "title": {
"type": "text", "type": "text",
"analyzer": "english", "analyzer": "english",
"fields": {"keyword":{"type":"keyword","ignore_above":256}} "fields": {
}}}}' \ "keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
}
}
}
}' \
"http://localhost:9200/$OPENSEARCH_INDEX" \ "http://localhost:9200/$OPENSEARCH_INDEX" \
-ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} -ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
echo "" echo ""

View File

@ -295,12 +295,24 @@ else
"text": { "text": {
"type": "text", "type": "text",
"analyzer": "english", "analyzer": "english",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}} "fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
}, },
"title": { "title": {
"type": "text", "type": "text",
"analyzer": "english", "analyzer": "english",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}} "fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
}, },
"title_embedding": { "title_embedding": {
"type": "knn_vector", "type": "knn_vector",