Search improvements: Add relevance search and make recent searches less strict (#1962)

* reconfigured search pipeline

* remove console debug messages

* log1p for comments

* improve relevance of non-relevance sorted queries

---------

Co-authored-by: Keyan <34140557+huumn@users.noreply.github.com>
Co-authored-by: k00b <k00b@stacker.news>
This commit is contained in:
Edward Kung 2025-03-17 17:25:20 -07:00 committed by GitHub
parent 1a52ff7784
commit b71398a06c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 185 additions and 158 deletions

View File

@ -174,7 +174,6 @@ export default {
search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor)
let sitems = null
let termQueries = []
// short circuit: return empty result if either:
// 1. no query provided, or
@ -186,56 +185,116 @@ export default {
}
}
const whatArr = []
// build query in parts:
// filters: determine the universe of potential search candidates
// termQueries: queries related to the actual search terms
// functions: rank modifiers to boost by recency or popularity
const filters = []
const termQueries = []
const functions = []
// filters for item types
switch (what) {
case 'posts':
whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
case 'posts': // posts only
filters.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
break
case 'comments':
whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
case 'comments': // comments only
filters.push({ bool: { must: { exists: { field: 'parentId' } } } })
break
case 'bookmarks':
if (me?.id) {
whatArr.push({ match: { bookmarkedBy: me?.id } })
filters.push({ match: { bookmarkedBy: me?.id } })
}
break
default:
break
}
// filter for active posts
filters.push(
me
? {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } },
{ match: { userId: me.id } }
]
}
}
: {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
]
}
}
)
// filter for time range
const whenRange = when === 'custom'
? {
gte: whenFrom,
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
}
: {
lte: decodedCursor.time,
gte: whenToFrom(when)
}
filters.push({ range: { createdAt: whenRange } })
// filter for non negative wvotes
filters.push({ range: { wvotes: { gte: 0 } } })
// decompose the search terms
const { query: _query, quotes, nym, url, territory } = queryParts(q)
let query = _query
const isUrlSearch = url && query.length === 0 // exclusively searching for an url
const query = _query
// if search contains a url term, modify the query text
if (url) {
const isFQDN = url.startsWith('url:www.')
const domain = isFQDN ? url.slice(8) : url.slice(4)
const fqdn = `www.${domain}`
query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}`
}
if (nym) {
whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
}
if (territory) {
whatArr.push({ match: { 'sub.name': territory.slice(1) } })
}
termQueries.push({
// all terms are matched in fields
multi_match: {
query,
type: 'best_fields',
fields: ['title^100', 'text'],
minimum_should_match: (isUrlSearch) ? 1 : '100%',
boost: 1000
const uri = url.slice(4)
let uriObj
try {
uriObj = new URL(uri)
} catch {
try {
uriObj = new URL(`https://${uri}`)
} catch {}
}
})
if (uriObj) {
termQueries.push({
wildcard: { url: `*${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}*` }
})
termQueries.push({
match: { text: `${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}` }
})
}
}
// if nym, items must contain nym
if (nym) {
filters.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
}
// if territory, item must be from territory
if (territory) {
filters.push({ match: { 'sub.name': territory.slice(1) } })
}
// if quoted phrases, items must contain entire phrase
for (const quote of quotes) {
whatArr.push({
termQueries.push({
multi_match: {
query: quote,
type: 'phrase',
fields: ['title', 'text']
}
})
// force the search to include the quoted phrase
filters.push({
multi_match: {
query: quote,
type: 'phrase',
@ -244,84 +303,45 @@ export default {
})
}
// if we search for an exact string only, everything must match
// so score purely on sort field
let boostMode = query ? 'multiply' : 'replace'
let sortField
let sortMod = 'log1p'
switch (sort) {
case 'comments':
sortField = 'ncomments'
sortMod = 'square'
break
case 'sats':
sortField = 'sats'
break
case 'recent':
sortField = 'createdAt'
sortMod = 'square'
boostMode = 'replace'
break
default:
sortField = 'wvotes'
sortMod = 'none'
break
}
const functions = [
{
field_value_factor: {
field: sortField,
modifier: sortMod,
factor: 1.2
}
}
]
if (sort === 'recent' && !isUrlSearch) {
// prioritize exact matches
termQueries.push({
multi_match: {
query,
type: 'phrase',
fields: ['title^100', 'text'],
boost: 1000
}
})
} else {
// allow fuzzy matching with partial matches
termQueries.push({
multi_match: {
query,
type: 'most_fields',
fields: ['title^100', 'text'],
fuzziness: 'AUTO',
prefix_length: 3,
minimum_should_match: (isUrlSearch) ? 1 : '60%'
}
})
functions.push({
// small bias toward posts with comments
field_value_factor: {
field: 'ncomments',
modifier: 'ln1p',
factor: 1
}
},
{
// small bias toward recent posts
field_value_factor: {
field: 'createdAt',
modifier: 'log1p',
factor: 1
}
})
}
// query for search terms
if (query.length) {
// if we have a model id and we aren't sort by recent, use neural search
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
termQueries = {
// keyword based subquery, to be used on its own or in conjunction with a neural
// search
const subquery = [
{
multi_match: {
query,
type: 'most_fields',
fields: ['title^10', 'text'],
fuzziness: 'AUTO',
minimum_should_match: 1
}
},
// all match matches higher
{
multi_match: {
query,
type: 'most_fields',
fields: ['title^10', 'text'],
minimum_should_match: '100%',
boost: 100
}
},
// phrase match matches higher
{
multi_match: {
query,
type: 'phrase',
fields: ['title^10', 'text'],
boost: 1000
}
}
]
// use hybrid neural search if model id is available, otherwise use only
// keyword search
if (process.env.OPENSEARCH_MODEL_ID) {
termQueries.push({
hybrid: {
queries: [
{
@ -350,26 +370,58 @@ export default {
},
{
bool: {
should: termQueries
should: subquery,
minimum_should_match: 1
}
}
]
}
}
})
} else {
termQueries.push(...subquery)
}
} else {
termQueries = []
}
const whenRange = when === 'custom'
? {
gte: whenFrom,
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
}
: {
lte: decodedCursor.time,
gte: whenToFrom(when)
}
// functions for boosting search rank by recency or popularity
switch (sort) {
case 'comments':
functions.push({
field_value_factor: {
field: 'ncomments',
modifier: 'log1p'
}
})
break
case 'sats':
functions.push({
field_value_factor: {
field: 'sats',
modifier: 'log1p'
}
})
break
case 'recent':
functions.push({
gauss: {
createdAt: {
origin: 'now',
scale: '7d',
decay: 0.5
}
}
})
break
case 'zaprank':
functions.push({
field_value_factor: {
field: 'wvotes',
modifier: 'log1p'
}
})
break
default:
break
}
try {
sitems = await search.search({
@ -388,39 +440,14 @@ export default {
function_score: {
query: {
bool: {
must: termQueries,
filter: [
...whatArr,
me
? {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } },
{ match: { userId: me.id } }
]
}
}
: {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
]
}
},
{
range:
{
createdAt: whenRange
}
},
{ range: { wvotes: { gte: 0 } } }
]
filter: filters,
should: termQueries,
minimum_should_match: 1
}
},
functions,
boost_mode: boostMode
score_mode: 'multiply',
boost_mode: 'multiply'
}
},
highlight: {
@ -458,7 +485,7 @@ export default {
${SELECT}, rank
FROM "Item"
JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC'
orderBy: 'ORDER BY rank ASC, msats DESC'
})).map((item, i) => {
const e = sitems.body.hits.hits[i]
item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title

View File

@ -36,7 +36,7 @@ export default function Search ({ sub }) {
}
if (values.what === '' || values.what === 'all') delete values.what
if (values.sort === '' || values.sort === 'zaprank') delete values.sort
if (values.sort === '' || values.sort === 'relevance') delete values.sort
if (values.when === '' || values.when === 'forever') delete values.when
if (values.when !== 'custom') { delete values.from; delete values.to }
if (values.from && !values.to) return
@ -50,7 +50,7 @@ export default function Search ({ sub }) {
const filter = sub !== 'jobs'
const what = router.pathname.startsWith('/stackers') ? 'stackers' : router.query.what || 'all'
const sort = router.query.sort || 'zaprank'
const sort = router.query.sort || 'relevance'
const when = router.query.when || 'forever'
const whatItemOptions = useMemo(() => (['all', 'posts', 'comments', me ? 'bookmarks' : undefined, 'stackers'].filter(item => !!item)), [me])
@ -100,7 +100,7 @@ export default function Search ({ sub }) {
name='sort'
size='sm'
overrideValue={sort}
items={['zaprank', 'recent', 'comments', 'sats']}
items={['relevance', 'zaprank', 'recent', 'comments', 'sats']}
/>
for
<Select