Search improvements: Add relevance search and make recent searches less strict (#1962)

* reconfigured search pipeline

* remove console debug messages

* use log1p modifier (instead of square) for the comments sort

* improve relevance of results for queries sorted by something other than relevance

---------

Co-authored-by: Keyan <34140557+huumn@users.noreply.github.com>
Co-authored-by: k00b <k00b@stacker.news>
This commit is contained in:
Edward Kung 2025-03-17 17:25:20 -07:00 committed by GitHub
parent 1a52ff7784
commit b71398a06c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 185 additions and 158 deletions

View File

@ -174,7 +174,6 @@ export default {
search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => { search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor) const decodedCursor = decodeCursor(cursor)
let sitems = null let sitems = null
let termQueries = []
// short circuit: return empty result if either: // short circuit: return empty result if either:
// 1. no query provided, or // 1. no query provided, or
@ -186,56 +185,116 @@ export default {
} }
} }
const whatArr = [] // build query in parts:
// filters: determine the universe of potential search candidates
// termQueries: queries related to the actual search terms
// functions: rank modifiers to boost by recency or popularity
const filters = []
const termQueries = []
const functions = []
// filters for item types
switch (what) { switch (what) {
case 'posts': case 'posts': // posts only
whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } }) filters.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
break break
case 'comments': case 'comments': // comments only
whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } }) filters.push({ bool: { must: { exists: { field: 'parentId' } } } })
break break
case 'bookmarks': case 'bookmarks':
if (me?.id) { if (me?.id) {
whatArr.push({ match: { bookmarkedBy: me?.id } }) filters.push({ match: { bookmarkedBy: me?.id } })
} }
break break
default: default:
break break
} }
// filter for active posts
filters.push(
me
? {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } },
{ match: { userId: me.id } }
]
}
}
: {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
]
}
}
)
// filter for time range
const whenRange = when === 'custom'
? {
gte: whenFrom,
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
}
: {
lte: decodedCursor.time,
gte: whenToFrom(when)
}
filters.push({ range: { createdAt: whenRange } })
// filter for non negative wvotes
filters.push({ range: { wvotes: { gte: 0 } } })
// decompose the search terms
const { query: _query, quotes, nym, url, territory } = queryParts(q) const { query: _query, quotes, nym, url, territory } = queryParts(q)
let query = _query const query = _query
const isUrlSearch = url && query.length === 0 // exclusively searching for an url
// if search contains a url term, modify the query text
if (url) { if (url) {
const isFQDN = url.startsWith('url:www.') const uri = url.slice(4)
const domain = isFQDN ? url.slice(8) : url.slice(4) let uriObj
const fqdn = `www.${domain}` try {
query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}` uriObj = new URL(uri)
} } catch {
try {
if (nym) { uriObj = new URL(`https://${uri}`)
whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } }) } catch {}
}
if (territory) {
whatArr.push({ match: { 'sub.name': territory.slice(1) } })
} }
if (uriObj) {
termQueries.push({
wildcard: { url: `*${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}*` }
})
termQueries.push({
match: { text: `${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}` }
})
}
}
// if nym, items must contain nym
if (nym) {
filters.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
}
// if territory, item must be from territory
if (territory) {
filters.push({ match: { 'sub.name': territory.slice(1) } })
}
// if quoted phrases, items must contain entire phrase
for (const quote of quotes) {
termQueries.push({ termQueries.push({
// all terms are matched in fields
multi_match: { multi_match: {
query, query: quote,
type: 'best_fields', type: 'phrase',
fields: ['title^100', 'text'], fields: ['title', 'text']
minimum_should_match: (isUrlSearch) ? 1 : '100%',
boost: 1000
} }
}) })
for (const quote of quotes) { // force the search to include the quoted phrase
whatArr.push({ filters.push({
multi_match: { multi_match: {
query: quote, query: quote,
type: 'phrase', type: 'phrase',
@ -244,84 +303,45 @@ export default {
}) })
} }
// if we search for an exact string only, everything must match // query for search terms
// so score purely on sort field if (query.length) {
let boostMode = query ? 'multiply' : 'replace' // keyword based subquery, to be used on its own or in conjunction with a neural
let sortField // search
let sortMod = 'log1p' const subquery = [
switch (sort) {
case 'comments':
sortField = 'ncomments'
sortMod = 'square'
break
case 'sats':
sortField = 'sats'
break
case 'recent':
sortField = 'createdAt'
sortMod = 'square'
boostMode = 'replace'
break
default:
sortField = 'wvotes'
sortMod = 'none'
break
}
const functions = [
{ {
field_value_factor: { multi_match: {
field: sortField, query,
modifier: sortMod, type: 'most_fields',
factor: 1.2 fields: ['title^10', 'text'],
fuzziness: 'AUTO',
minimum_should_match: 1
}
},
// all match matches higher
{
multi_match: {
query,
type: 'most_fields',
fields: ['title^10', 'text'],
minimum_should_match: '100%',
boost: 100
}
},
// phrase match matches higher
{
multi_match: {
query,
type: 'phrase',
fields: ['title^10', 'text'],
boost: 1000
} }
} }
] ]
if (sort === 'recent' && !isUrlSearch) { // use hybrid neural search if model id is available, otherwise use only
// prioritize exact matches // keyword search
if (process.env.OPENSEARCH_MODEL_ID) {
termQueries.push({ termQueries.push({
multi_match: {
query,
type: 'phrase',
fields: ['title^100', 'text'],
boost: 1000
}
})
} else {
// allow fuzzy matching with partial matches
termQueries.push({
multi_match: {
query,
type: 'most_fields',
fields: ['title^100', 'text'],
fuzziness: 'AUTO',
prefix_length: 3,
minimum_should_match: (isUrlSearch) ? 1 : '60%'
}
})
functions.push({
// small bias toward posts with comments
field_value_factor: {
field: 'ncomments',
modifier: 'ln1p',
factor: 1
}
},
{
// small bias toward recent posts
field_value_factor: {
field: 'createdAt',
modifier: 'log1p',
factor: 1
}
})
}
if (query.length) {
// if we have a model id and we aren't sort by recent, use neural search
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
termQueries = {
hybrid: { hybrid: {
queries: [ queries: [
{ {
@ -350,25 +370,57 @@ export default {
}, },
{ {
bool: { bool: {
should: termQueries should: subquery,
minimum_should_match: 1
} }
} }
] ]
} }
} })
}
} else { } else {
termQueries = [] termQueries.push(...subquery)
}
} }
const whenRange = when === 'custom' // functions for boosting search rank by recency or popularity
? { switch (sort) {
gte: whenFrom, case 'comments':
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time)) functions.push({
field_value_factor: {
field: 'ncomments',
modifier: 'log1p'
} }
: { })
lte: decodedCursor.time, break
gte: whenToFrom(when) case 'sats':
functions.push({
field_value_factor: {
field: 'sats',
modifier: 'log1p'
}
})
break
case 'recent':
functions.push({
gauss: {
createdAt: {
origin: 'now',
scale: '7d',
decay: 0.5
}
}
})
break
case 'zaprank':
functions.push({
field_value_factor: {
field: 'wvotes',
modifier: 'log1p'
}
})
break
default:
break
} }
try { try {
@ -388,39 +440,14 @@ export default {
function_score: { function_score: {
query: { query: {
bool: { bool: {
must: termQueries, filter: filters,
filter: [ should: termQueries,
...whatArr, minimum_should_match: 1
me
? {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } },
{ match: { userId: me.id } }
]
}
}
: {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
]
}
},
{
range:
{
createdAt: whenRange
}
},
{ range: { wvotes: { gte: 0 } } }
]
} }
}, },
functions, functions,
boost_mode: boostMode score_mode: 'multiply',
boost_mode: 'multiply'
} }
}, },
highlight: { highlight: {
@ -458,7 +485,7 @@ export default {
${SELECT}, rank ${SELECT}, rank
FROM "Item" FROM "Item"
JOIN r ON "Item".id = r.id`, JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC' orderBy: 'ORDER BY rank ASC, msats DESC'
})).map((item, i) => { })).map((item, i) => {
const e = sitems.body.hits.hits[i] const e = sitems.body.hits.hits[i]
item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title

View File

@ -36,7 +36,7 @@ export default function Search ({ sub }) {
} }
if (values.what === '' || values.what === 'all') delete values.what if (values.what === '' || values.what === 'all') delete values.what
if (values.sort === '' || values.sort === 'zaprank') delete values.sort if (values.sort === '' || values.sort === 'relevance') delete values.sort
if (values.when === '' || values.when === 'forever') delete values.when if (values.when === '' || values.when === 'forever') delete values.when
if (values.when !== 'custom') { delete values.from; delete values.to } if (values.when !== 'custom') { delete values.from; delete values.to }
if (values.from && !values.to) return if (values.from && !values.to) return
@ -50,7 +50,7 @@ export default function Search ({ sub }) {
const filter = sub !== 'jobs' const filter = sub !== 'jobs'
const what = router.pathname.startsWith('/stackers') ? 'stackers' : router.query.what || 'all' const what = router.pathname.startsWith('/stackers') ? 'stackers' : router.query.what || 'all'
const sort = router.query.sort || 'zaprank' const sort = router.query.sort || 'relevance'
const when = router.query.when || 'forever' const when = router.query.when || 'forever'
const whatItemOptions = useMemo(() => (['all', 'posts', 'comments', me ? 'bookmarks' : undefined, 'stackers'].filter(item => !!item)), [me]) const whatItemOptions = useMemo(() => (['all', 'posts', 'comments', me ? 'bookmarks' : undefined, 'stackers'].filter(item => !!item)), [me])
@ -100,7 +100,7 @@ export default function Search ({ sub }) {
name='sort' name='sort'
size='sm' size='sm'
overrideValue={sort} overrideValue={sort}
items={['zaprank', 'recent', 'comments', 'sats']} items={['relevance', 'zaprank', 'recent', 'comments', 'sats']}
/> />
for for
<Select <Select