diff --git a/api/resolvers/search.js b/api/resolvers/search.js index 1068844d..08216a2c 100644 --- a/api/resolvers/search.js +++ b/api/resolvers/search.js @@ -174,7 +174,6 @@ export default { search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => { const decodedCursor = decodeCursor(cursor) let sitems = null - let termQueries = [] // short circuit: return empty result if either: // 1. no query provided, or @@ -186,56 +185,116 @@ export default { } } - const whatArr = [] + // build query in parts: + // filters: determine the universe of potential search candidates + // termQueries: queries related to the actual search terms + // functions: rank modifiers to boost by recency or popularity + const filters = [] + const termQueries = [] + const functions = [] + + // filters for item types switch (what) { - case 'posts': - whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } }) + case 'posts': // posts only + filters.push({ bool: { must_not: { exists: { field: 'parentId' } } } }) break - case 'comments': - whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } }) + case 'comments': // comments only + filters.push({ bool: { must: { exists: { field: 'parentId' } } } }) break case 'bookmarks': if (me?.id) { - whatArr.push({ match: { bookmarkedBy: me?.id } }) + filters.push({ match: { bookmarkedBy: me?.id } }) } break default: break } + // filter for active posts + filters.push( + me + ? { + bool: { + should: [ + { match: { status: 'ACTIVE' } }, + { match: { status: 'NOSATS' } }, + { match: { userId: me.id } } + ] + } + } + : { + bool: { + should: [ + { match: { status: 'ACTIVE' } }, + { match: { status: 'NOSATS' } } + ] + } + } + ) + + // filter for time range + const whenRange = when === 'custom' + ? { + gte: whenFrom, + lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time)) + } + : { + lte: decodedCursor.time, + gte: whenToFrom(when) + } + filters.push({ range: { createdAt: whenRange } }) + + // filter for non negative wvotes + filters.push({ range: { wvotes: { gte: 0 } } }) + + // decompose the search terms const { query: _query, quotes, nym, url, territory } = queryParts(q) - let query = _query - - const isUrlSearch = url && query.length === 0 // exclusively searching for an url + const query = _query + // if search contains a url term, modify the query text if (url) { - const isFQDN = url.startsWith('url:www.') - const domain = isFQDN ? url.slice(8) : url.slice(4) - const fqdn = `www.${domain}` - query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}` - } - - if (nym) { - whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } }) - } - - if (territory) { - whatArr.push({ match: { 'sub.name': territory.slice(1) } }) - } - - termQueries.push({ - // all terms are matched in fields - multi_match: { - query, - type: 'best_fields', - fields: ['title^100', 'text'], - minimum_should_match: (isUrlSearch) ? 1 : '100%', - boost: 1000 + const uri = url.slice(4) + let uriObj + try { + uriObj = new URL(uri) + } catch { + try { + uriObj = new URL(`https://${uri}`) + } catch {} } - }) + if (uriObj) { + termQueries.push({ + wildcard: { url: `*${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}*` } + }) + termQueries.push({ + match: { text: `${uriObj?.hostname ?? uri}${uriObj?.pathname ?? ''}` } + }) + } + } + + // if nym, items must contain nym + if (nym) { + filters.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } }) + } + + // if territory, item must be from territory + if (territory) { + filters.push({ match: { 'sub.name': territory.slice(1) } }) + } + + // if quoted phrases, items must contain entire phrase for (const quote of quotes) { - whatArr.push({ + termQueries.push({ + multi_match: { + query: quote, + type: 'phrase', + fields: ['title', 'text'] + } + }) + + // force the search to include the quoted phrase + filters.push({ multi_match: { query: quote, type: 'phrase', @@ -244,84 +303,45 @@ export default { }) } - // if we search for an exact string only, everything must match - // so score purely on sort field - let boostMode = query ? 'multiply' : 'replace' - let sortField - let sortMod = 'log1p' - switch (sort) { - case 'comments': - sortField = 'ncomments' - sortMod = 'square' - break - case 'sats': - sortField = 'sats' - break - case 'recent': - sortField = 'createdAt' - sortMod = 'square' - boostMode = 'replace' - break - default: - sortField = 'wvotes' - sortMod = 'none' - break - } - - const functions = [ - { - field_value_factor: { - field: sortField, - modifier: sortMod, - factor: 1.2 - } - } - ] - - if (sort === 'recent' && !isUrlSearch) { - // prioritize exact matches - termQueries.push({ - multi_match: { - query, - type: 'phrase', - fields: ['title^100', 'text'], - boost: 1000 - } - }) - } else { - // allow fuzzy matching with partial matches - termQueries.push({ - multi_match: { - query, - type: 'most_fields', - fields: ['title^100', 'text'], - fuzziness: 'AUTO', - prefix_length: 3, - minimum_should_match: (isUrlSearch) ? 1 : '60%' - } - }) - functions.push({ - // small bias toward posts with comments - field_value_factor: { - field: 'ncomments', - modifier: 'ln1p', - factor: 1 - } - }, - { - // small bias toward recent posts - field_value_factor: { - field: 'createdAt', - modifier: 'log1p', - factor: 1 - } - }) - } - + // query for search terms if (query.length) { - // if we have a model id and we aren't sort by recent, use neural search - if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') { - termQueries = { + // keyword based subquery, to be used on its own or in conjunction with a neural + // search + const subquery = [ + { + multi_match: { + query, + type: 'most_fields', + fields: ['title^10', 'text'], + fuzziness: 'AUTO', + minimum_should_match: 1 + } + }, + // all match matches higher + { + multi_match: { + query, + type: 'most_fields', + fields: ['title^10', 'text'], + minimum_should_match: '100%', + boost: 100 + } + }, + // phrase match matches higher + { + multi_match: { + query, + type: 'phrase', + fields: ['title^10', 'text'], + boost: 1000 + } + } + ] + + // use hybrid neural search if model id is available, otherwise use only + // keyword search + if (process.env.OPENSEARCH_MODEL_ID) { + termQueries.push({ hybrid: { queries: [ { @@ -350,26 +370,58 @@ export default { }, { bool: { - should: termQueries + should: subquery, + minimum_should_match: 1 } } ] } - } + }) + } else { + termQueries.push(...subquery) } - } else { - termQueries = [] } - const whenRange = when === 'custom' - ? { - gte: whenFrom, - lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time)) - } - : { - lte: decodedCursor.time, - gte: whenToFrom(when) - } + // functions for boosting search rank by recency or popularity + switch (sort) { + case 'comments': + functions.push({ + field_value_factor: { + field: 'ncomments', + modifier: 'log1p' + } + }) + break + case 'sats': + functions.push({ + field_value_factor: { + field: 'sats', + modifier: 'log1p' + } + }) + break + case 'recent': + functions.push({ + gauss: { + createdAt: { + origin: 'now', + scale: '7d', + decay: 0.5 + } + } + }) + break + case 'zaprank': + functions.push({ + field_value_factor: { + field: 'wvotes', + modifier: 'log1p' + } + }) + break + default: + break + } try { sitems = await search.search({ @@ -388,39 +440,14 @@ export default { function_score: { query: { bool: { - must: termQueries, - filter: [ - ...whatArr, - me - ? { - bool: { - should: [ - { match: { status: 'ACTIVE' } }, - { match: { status: 'NOSATS' } }, - { match: { userId: me.id } } - ] - } - } - : { - bool: { - should: [ - { match: { status: 'ACTIVE' } }, - { match: { status: 'NOSATS' } } - ] - } - }, - { - range: - { - createdAt: whenRange - } - }, - { range: { wvotes: { gte: 0 } } } - ] + filter: filters, + should: termQueries, + minimum_should_match: 1 } }, functions, - boost_mode: boostMode + score_mode: 'multiply', + boost_mode: 'multiply' } }, highlight: { @@ -458,7 +485,7 @@ export default { ${SELECT}, rank FROM "Item" JOIN r ON "Item".id = r.id`, - orderBy: 'ORDER BY rank ASC' + orderBy: 'ORDER BY rank ASC, msats DESC' })).map((item, i) => { const e = sitems.body.hits.hits[i] item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title diff --git a/components/search.js b/components/search.js index b834c019..67c9d23c 100644 --- a/components/search.js +++ b/components/search.js @@ -36,7 +36,7 @@ export default function Search ({ sub }) { } if (values.what === '' || values.what === 'all') delete values.what - if (values.sort === '' || values.sort === 'zaprank') delete values.sort + if (values.sort === '' || values.sort === 'relevance') delete values.sort if (values.when === '' || values.when === 'forever') delete values.when if (values.when !== 'custom') { delete values.from; delete values.to } if (values.from && !values.to) return @@ -50,7 +50,7 @@ export default function Search ({ sub }) { const filter = sub !== 'jobs' const what = router.pathname.startsWith('/stackers') ? 'stackers' : router.query.what || 'all' - const sort = router.query.sort || 'zaprank' + const sort = router.query.sort || 'relevance' const when = router.query.when || 'forever' const whatItemOptions = useMemo(() => (['all', 'posts', 'comments', me ? 'bookmarks' : undefined, 'stackers'].filter(item => !!item)), [me]) @@ -100,7 +100,7 @@ export default function Search ({ sub }) { name='sort' size='sm' overrideValue={sort} - items={['zaprank', 'recent', 'comments', 'sats']} + items={['relevance', 'zaprank', 'recent', 'comments', 'sats']} /> for