Search improvements: Add relevance search and make recent searches less strict (#1962)
* reconfigured search pipeline
* remove console debug messages
* log1p for comments
* improve relevance of non-relevance sorted queries

---------

Co-authored-by: Keyan <34140557+huumn@users.noreply.github.com>
Co-authored-by: k00b <k00b@stacker.news>
This commit is contained in:
parent
1a52ff7784
commit
b71398a06c
@ -174,7 +174,6 @@ export default {
|
||||
search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
|
||||
const decodedCursor = decodeCursor(cursor)
|
||||
let sitems = null
|
||||
let termQueries = []
|
||||
|
||||
// short circuit: return empty result if either:
|
||||
// 1. no query provided, or
|
||||
@ -186,56 +185,116 @@ export default {
|
||||
}
|
||||
}
|
||||
|
||||
const whatArr = []
|
||||
// build query in parts:
|
||||
// filters: determine the universe of potential search candidates
|
||||
// termQueries: queries related to the actual search terms
|
||||
// functions: rank modifiers to boost by recency or popularity
|
||||
const filters = []
|
||||
const termQueries = []
|
||||
const functions = []
|
||||
|
||||
// filters for item types
|
||||
switch (what) {
|
||||
case 'posts':
|
||||
whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
|
||||
case 'posts': // posts only
|
||||
filters.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
|
||||
break
|
||||
case 'comments':
|
||||
whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
|
||||
case 'comments': // comments only
|
||||
filters.push({ bool: { must: { exists: { field: 'parentId' } } } })
|
||||
break
|
||||
case 'bookmarks':
|
||||
if (me?.id) {
|
||||
whatArr.push({ match: { bookmarkedBy: me?.id } })
|
||||
filters.push({ match: { bookmarkedBy: me?.id } })
|
||||
}
|
||||
break
|
||||
default:
|
||||
break
|
||||
}
|
||||
|
||||
// filter for active posts
|
||||
filters.push(
|
||||
me
|
||||
? {
|
||||
bool: {
|
||||
should: [
|
||||
{ match: { status: 'ACTIVE' } },
|
||||
{ match: { status: 'NOSATS' } },
|
||||
{ match: { userId: me.id } }
|
||||
]
|
||||
}
|
||||
}
|
||||
: {
|
||||
bool: {
|
||||
should: [
|
||||
{ match: { status: 'ACTIVE' } },
|
||||
{ match: { status: 'NOSATS' } }
|
||||
]
|
||||
}
|
||||
}
|
||||
)
|
||||
|
||||
// filter for time range
|
||||
const whenRange = when === 'custom'
|
||||
? {
|
||||
gte: whenFrom,
|
||||
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
|
||||
}
|
||||
: {
|
||||
lte: decodedCursor.time,
|
||||
gte: whenToFrom(when)
|
||||
}
|
||||
filters.push({ range: { createdAt: whenRange } })
|
||||
|
||||
// filter for non negative wvotes
|
||||
filters.push({ range: { wvotes: { gte: 0 } } })
|
||||
|
||||
// decompose the search terms
|
||||
const { query: _query, quotes, nym, url, territory } = queryParts(q)
|
||||
let query = _query
|
||||
|
||||
const isUrlSearch = url && query.length === 0 // exclusively searching for an url
|
||||
const query = _query
|
||||
|
||||
// if search contains a url term, modify the query text
|
||||
if (url) {
|
||||
const isFQDN = url.startsWith('url:www.')
|
||||
const domain = isFQDN ? url.slice(8) : url.slice(4)
|
||||
const fqdn = `www.${domain}`
|
||||
query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}`
|
||||
}
|
||||
|
||||
if (nym) {
|
||||
whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
|
||||
}
|
||||
|
||||
if (territory) {
|
||||
whatArr.push({ match: { 'sub.name': territory.slice(1) } })
|
||||
}
|
||||
|
||||
termQueries.push({
|
||||
// all terms are matched in fields
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'best_fields',
|
||||
fields: ['title^100', 'text'],
|
||||
minimum_should_match: (isUrlSearch) ? 1 : '100%',
|
||||
boost: 1000
|
||||
const uri = url.slice(4)
|
||||
let uriObj
|
||||
try {
|
||||
uriObj = new URL(uri)
|
||||
} catch {
|
||||
try {
|
||||
uriObj = new URL(`https://${uri}`)
|
||||
} catch {}
|
||||
}
|
||||
})
|
||||
|
||||
if (uriObj) {
|
||||
termQueries.push({
|
||||
wildcard: { url: `*${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}*` }
|
||||
})
|
||||
termQueries.push({
|
||||
match: { text: `${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}` }
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// if nym, items must contain nym
|
||||
if (nym) {
|
||||
filters.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
|
||||
}
|
||||
|
||||
// if territory, item must be from territory
|
||||
if (territory) {
|
||||
filters.push({ match: { 'sub.name': territory.slice(1) } })
|
||||
}
|
||||
|
||||
// if quoted phrases, items must contain entire phrase
|
||||
for (const quote of quotes) {
|
||||
whatArr.push({
|
||||
termQueries.push({
|
||||
multi_match: {
|
||||
query: quote,
|
||||
type: 'phrase',
|
||||
fields: ['title', 'text']
|
||||
}
|
||||
})
|
||||
|
||||
// force the search to include the quoted phrase
|
||||
filters.push({
|
||||
multi_match: {
|
||||
query: quote,
|
||||
type: 'phrase',
|
||||
@ -244,84 +303,45 @@ export default {
|
||||
})
|
||||
}
|
||||
|
||||
// if we search for an exact string only, everything must match
|
||||
// so score purely on sort field
|
||||
let boostMode = query ? 'multiply' : 'replace'
|
||||
let sortField
|
||||
let sortMod = 'log1p'
|
||||
switch (sort) {
|
||||
case 'comments':
|
||||
sortField = 'ncomments'
|
||||
sortMod = 'square'
|
||||
break
|
||||
case 'sats':
|
||||
sortField = 'sats'
|
||||
break
|
||||
case 'recent':
|
||||
sortField = 'createdAt'
|
||||
sortMod = 'square'
|
||||
boostMode = 'replace'
|
||||
break
|
||||
default:
|
||||
sortField = 'wvotes'
|
||||
sortMod = 'none'
|
||||
break
|
||||
}
|
||||
|
||||
const functions = [
|
||||
{
|
||||
field_value_factor: {
|
||||
field: sortField,
|
||||
modifier: sortMod,
|
||||
factor: 1.2
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
if (sort === 'recent' && !isUrlSearch) {
|
||||
// prioritize exact matches
|
||||
termQueries.push({
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'phrase',
|
||||
fields: ['title^100', 'text'],
|
||||
boost: 1000
|
||||
}
|
||||
})
|
||||
} else {
|
||||
// allow fuzzy matching with partial matches
|
||||
termQueries.push({
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^100', 'text'],
|
||||
fuzziness: 'AUTO',
|
||||
prefix_length: 3,
|
||||
minimum_should_match: (isUrlSearch) ? 1 : '60%'
|
||||
}
|
||||
})
|
||||
functions.push({
|
||||
// small bias toward posts with comments
|
||||
field_value_factor: {
|
||||
field: 'ncomments',
|
||||
modifier: 'ln1p',
|
||||
factor: 1
|
||||
}
|
||||
},
|
||||
{
|
||||
// small bias toward recent posts
|
||||
field_value_factor: {
|
||||
field: 'createdAt',
|
||||
modifier: 'log1p',
|
||||
factor: 1
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// query for search terms
|
||||
if (query.length) {
|
||||
// if we have a model id and we aren't sort by recent, use neural search
|
||||
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
|
||||
termQueries = {
|
||||
// keyword based subquery, to be used on its own or in conjunction with a neural
|
||||
// search
|
||||
const subquery = [
|
||||
{
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^10', 'text'],
|
||||
fuzziness: 'AUTO',
|
||||
minimum_should_match: 1
|
||||
}
|
||||
},
|
||||
// all match matches higher
|
||||
{
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^10', 'text'],
|
||||
minimum_should_match: '100%',
|
||||
boost: 100
|
||||
}
|
||||
},
|
||||
// phrase match matches higher
|
||||
{
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'phrase',
|
||||
fields: ['title^10', 'text'],
|
||||
boost: 1000
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
// use hybrid neural search if model id is available, otherwise use only
|
||||
// keyword search
|
||||
if (process.env.OPENSEARCH_MODEL_ID) {
|
||||
termQueries.push({
|
||||
hybrid: {
|
||||
queries: [
|
||||
{
|
||||
@ -350,26 +370,58 @@ export default {
|
||||
},
|
||||
{
|
||||
bool: {
|
||||
should: termQueries
|
||||
should: subquery,
|
||||
minimum_should_match: 1
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
})
|
||||
} else {
|
||||
termQueries.push(...subquery)
|
||||
}
|
||||
} else {
|
||||
termQueries = []
|
||||
}
|
||||
|
||||
const whenRange = when === 'custom'
|
||||
? {
|
||||
gte: whenFrom,
|
||||
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
|
||||
}
|
||||
: {
|
||||
lte: decodedCursor.time,
|
||||
gte: whenToFrom(when)
|
||||
}
|
||||
// functions for boosting search rank by recency or popularity
|
||||
switch (sort) {
|
||||
case 'comments':
|
||||
functions.push({
|
||||
field_value_factor: {
|
||||
field: 'ncomments',
|
||||
modifier: 'log1p'
|
||||
}
|
||||
})
|
||||
break
|
||||
case 'sats':
|
||||
functions.push({
|
||||
field_value_factor: {
|
||||
field: 'sats',
|
||||
modifier: 'log1p'
|
||||
}
|
||||
})
|
||||
break
|
||||
case 'recent':
|
||||
functions.push({
|
||||
gauss: {
|
||||
createdAt: {
|
||||
origin: 'now',
|
||||
scale: '7d',
|
||||
decay: 0.5
|
||||
}
|
||||
}
|
||||
})
|
||||
break
|
||||
case 'zaprank':
|
||||
functions.push({
|
||||
field_value_factor: {
|
||||
field: 'wvotes',
|
||||
modifier: 'log1p'
|
||||
}
|
||||
})
|
||||
break
|
||||
default:
|
||||
break
|
||||
}
|
||||
|
||||
try {
|
||||
sitems = await search.search({
|
||||
@ -388,39 +440,14 @@ export default {
|
||||
function_score: {
|
||||
query: {
|
||||
bool: {
|
||||
must: termQueries,
|
||||
filter: [
|
||||
...whatArr,
|
||||
me
|
||||
? {
|
||||
bool: {
|
||||
should: [
|
||||
{ match: { status: 'ACTIVE' } },
|
||||
{ match: { status: 'NOSATS' } },
|
||||
{ match: { userId: me.id } }
|
||||
]
|
||||
}
|
||||
}
|
||||
: {
|
||||
bool: {
|
||||
should: [
|
||||
{ match: { status: 'ACTIVE' } },
|
||||
{ match: { status: 'NOSATS' } }
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
range:
|
||||
{
|
||||
createdAt: whenRange
|
||||
}
|
||||
},
|
||||
{ range: { wvotes: { gte: 0 } } }
|
||||
]
|
||||
filter: filters,
|
||||
should: termQueries,
|
||||
minimum_should_match: 1
|
||||
}
|
||||
},
|
||||
functions,
|
||||
boost_mode: boostMode
|
||||
score_mode: 'multiply',
|
||||
boost_mode: 'multiply'
|
||||
}
|
||||
},
|
||||
highlight: {
|
||||
@ -458,7 +485,7 @@ export default {
|
||||
${SELECT}, rank
|
||||
FROM "Item"
|
||||
JOIN r ON "Item".id = r.id`,
|
||||
orderBy: 'ORDER BY rank ASC'
|
||||
orderBy: 'ORDER BY rank ASC, msats DESC'
|
||||
})).map((item, i) => {
|
||||
const e = sitems.body.hits.hits[i]
|
||||
item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title
|
||||
|
@ -36,7 +36,7 @@ export default function Search ({ sub }) {
|
||||
}
|
||||
|
||||
if (values.what === '' || values.what === 'all') delete values.what
|
||||
if (values.sort === '' || values.sort === 'zaprank') delete values.sort
|
||||
if (values.sort === '' || values.sort === 'relevance') delete values.sort
|
||||
if (values.when === '' || values.when === 'forever') delete values.when
|
||||
if (values.when !== 'custom') { delete values.from; delete values.to }
|
||||
if (values.from && !values.to) return
|
||||
@ -50,7 +50,7 @@ export default function Search ({ sub }) {
|
||||
|
||||
const filter = sub !== 'jobs'
|
||||
const what = router.pathname.startsWith('/stackers') ? 'stackers' : router.query.what || 'all'
|
||||
const sort = router.query.sort || 'zaprank'
|
||||
const sort = router.query.sort || 'relevance'
|
||||
const when = router.query.when || 'forever'
|
||||
const whatItemOptions = useMemo(() => (['all', 'posts', 'comments', me ? 'bookmarks' : undefined, 'stackers'].filter(item => !!item)), [me])
|
||||
|
||||
@ -100,7 +100,7 @@ export default function Search ({ sub }) {
|
||||
name='sort'
|
||||
size='sm'
|
||||
overrideValue={sort}
|
||||
items={['zaprank', 'recent', 'comments', 'sats']}
|
||||
items={['relevance', 'zaprank', 'recent', 'comments', 'sats']}
|
||||
/>
|
||||
for
|
||||
<Select
|
||||
|
Loading…
x
Reference in New Issue
Block a user