import { decodeCursor, LIMIT, nextCursorEncoded } from '@/lib/cursor'
import { whenToFrom } from '@/lib/time'
import { getItem, itemQueryWithMeta, SELECT } from './item'

// parse the free-form search string into its parts, e.g.
//   queryParts('"exact phrase" url:www.example.com @alice ~bitcoin lightning')
// returns quotes: ['exact phrase'], url: 'url:www.example.com', nym: '@alice',
// territory: '~bitcoin', query: 'lightning'
function queryParts (q) {
  const regex = /"([^"]*)"/gm

  const queryArr = q.replace(regex, '').trim().split(/\s+/)
  const url = queryArr.find(word => word.startsWith('url:'))
  const nym = queryArr.find(word => word.startsWith('@'))
  const territory = queryArr.find(word => word.startsWith('~'))
  const exclude = [url, nym, territory]
  const query = queryArr.filter(word => !exclude.includes(word)).join(' ')

  return {
    quotes: [...q.matchAll(regex)].map(m => m[1]),
    nym,
    url,
    territory,
    query
  }
}

export default {
  Query: {
    // find items similar to a given item (by id) or to a free-form title
    related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
      const decodedCursor = decodeCursor(cursor)

      if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
        return {
          items: [],
          cursor: null
        }
      }

      const like = []
      if (id) {
        like.push({
          _index: process.env.OPENSEARCH_INDEX,
          _id: id
        })
      }

      if (title) {
        like.push(title)
      }

      const mustNot = [{ exists: { field: 'parentId' } }]
      if (id) {
        mustNot.push({ term: { id } })
      }

      let should = [
        {
          more_like_this: {
            fields: ['title', 'text'],
            like,
            min_term_freq: 1,
            min_doc_freq: 1,
            max_doc_freq: 5,
            min_word_length: 2,
            max_query_terms: 25,
            minimum_should_match: minMatch || '10%',
            boost_terms: 100
          }
        }
      ]

      if (process.env.OPENSEARCH_MODEL_ID) {
        let qtitle = title
        let qtext = title
        if (id) {
          const item = await getItem(parent, { id }, { me, models })
          qtitle = item.title || item.text
          qtext = item.text || item.title
        }

        should = [
          {
            neural: {
              title_embedding: {
                query_text: qtext,
                model_id: process.env.OPENSEARCH_MODEL_ID,
                k: decodedCursor.offset + LIMIT
              }
            }
          },
          {
            neural: {
              text_embedding: {
                query_text: qtitle,
                model_id: process.env.OPENSEARCH_MODEL_ID,
                k: decodedCursor.offset + LIMIT
              }
            }
          }
        ]
      }

      const results = await search.search({
        index: process.env.OPENSEARCH_INDEX,
        size: limit,
        from: decodedCursor.offset,
        _source: {
          excludes: [
            'text',
            'text_embedding',
            'title_embedding'
          ]
        },
        body: {
          query: {
            function_score: {
              query: {
                bool: {
                  should,
                  filter: [
                    {
                      bool: {
                        should: [
                          { match: { status: 'ACTIVE' } },
                          { match: { status: 'NOSATS' } }
                        ],
                        must_not: mustNot
                      }
                    },
                    {
                      range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
                    }
                  ]
                }
              },
              functions: [{
                field_value_factor: {
                  field: 'wvotes',
                  modifier: 'none',
                  factor: 1,
                  missing: 0
                }
              }],
              boost_mode: 'multiply'
            }
          }
        }
      })

      const values = results.body.hits.hits.map((e, i) => {
        return `(${e._source.id}, ${i})`
      }).join(',')

      if (values.length === 0) {
        return {
          cursor: null,
          items: []
        }
      }

      const items = await itemQueryWithMeta({
        me,
        models,
        query: `
          WITH r(id, rank) AS (VALUES ${values})
          ${SELECT}, rank
          FROM "Item"
          JOIN r ON "Item".id = r.id`,
        orderBy: 'ORDER BY rank ASC'
      })

      return {
        cursor: items.length === (limit || LIMIT) ? nextCursorEncoded(decodedCursor) : null,
        items
      }
    },
    // full-text / neural search across items
    search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
      const decodedCursor = decodeCursor(cursor)
      let sitems = null
      let termQueries = []
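      // termQueries starts as an array of lexical multi_match clauses; if a neural
      // model is configured and sort isn't 'recent', it is swapped for a single
      // hybrid (neural + lexical) query further down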

      // short circuit: return empty result if either:
      // 1. no query provided, or
      // 2. searching bookmarks without being authed
      if (!q || (what === 'bookmarks' && !me)) {
        return {
          items: [],
          cursor: null
        }
      }

      const whatArr = []
      switch (what) {
        case 'posts':
          whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
          break
        case 'comments':
          whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
          break
        case 'bookmarks':
          if (me?.id) {
            whatArr.push({ match: { bookmarkedBy: me?.id } })
          }
          break
        default:
          break
      }

      const { query: _query, quotes, nym, url, territory } = queryParts(q)
      let query = _query

      const isUrlSearch = url && query.length === 0 // exclusively searching for a url

      if (url) {
        const isFQDN = url.startsWith('url:www.')
        const domain = isFQDN ? url.slice(8) : url.slice(4)
        const fqdn = `www.${domain}`
        query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}`
      }

      if (nym) {
        whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
      }

      if (territory) {
        whatArr.push({ match: { 'sub.name': territory.slice(1) } })
      }

      termQueries.push({
        // all terms are matched in fields
        multi_match: {
          query,
          type: 'best_fields',
          fields: ['title^100', 'text'],
          minimum_should_match: (isUrlSearch) ? 1 : '100%',
          boost: 1000
        }
      })

      for (const quote of quotes) {
        whatArr.push({
          multi_match: {
            query: quote,
            type: 'phrase',
            fields: ['title', 'text']
          }
        })
      }

      // if we search for an exact string only, everything must match
      // so score purely on sort field
      let boostMode = query ? 'multiply' : 'replace'
      let sortField
      let sortMod = 'log1p'
      switch (sort) {
        case 'comments':
          sortField = 'ncomments'
          sortMod = 'square'
          break
        case 'sats':
          sortField = 'sats'
          break
        case 'recent':
          sortField = 'createdAt'
          sortMod = 'square'
          boostMode = 'replace'
          break
        default:
          sortField = 'wvotes'
          sortMod = 'none'
          break
      }

      const functions = [
        {
          field_value_factor: {
            field: sortField,
            modifier: sortMod,
            factor: 1.2
          }
        }
      ]

      if (sort === 'recent' && !isUrlSearch) {
        // prioritize exact matches
        termQueries.push({
          multi_match: {
            query,
            type: 'phrase',
            fields: ['title^100', 'text'],
            boost: 1000
          }
        })
      } else {
        // allow fuzzy matching with partial matches
        termQueries.push({
          multi_match: {
            query,
            type: 'most_fields',
            fields: ['title^100', 'text'],
            fuzziness: 'AUTO',
            prefix_length: 3,
            minimum_should_match: (isUrlSearch) ? 1 : '60%'
          }
        })
        functions.push({
          // small bias toward posts with comments
          field_value_factor: {
            field: 'ncomments',
            modifier: 'ln1p',
            factor: 1
          }
        },
        {
          // small bias toward recent posts
          field_value_factor: {
            field: 'createdAt',
            modifier: 'log1p',
            factor: 1
          }
        })
      }

      if (query.length) {
        // if we have a model id and we aren't sorting by recent, use neural search
        if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
          termQueries = {
            hybrid: {
              queries: [
                {
                  bool: {
                    should: [
                      {
                        neural: {
                          title_embedding: {
                            query_text: query,
                            model_id: process.env.OPENSEARCH_MODEL_ID,
                            k: decodedCursor.offset + LIMIT
                          }
                        }
                      },
                      {
                        neural: {
                          text_embedding: {
                            query_text: query,
                            model_id: process.env.OPENSEARCH_MODEL_ID,
                            k: decodedCursor.offset + LIMIT
                          }
                        }
                      }
                    ]
                  }
                },
                {
                  bool: {
                    should: termQueries
                  }
                }
              ]
            }
          }
        }
      } else {
        termQueries = []
      }

      const whenRange = when === 'custom'
        ? {
            gte: whenFrom,
            lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
          }
        : {
            lte: decodedCursor.time,
            gte: whenToFrom(when)
          }

      try {
        sitems = await search.search({
          index: process.env.OPENSEARCH_INDEX,
          size: LIMIT,
          _source: {
            excludes: [
              'text',
              'text_embedding',
              'title_embedding'
            ]
          },
          from: decodedCursor.offset,
          body: {
            query: {
              function_score: {
                query: {
                  bool: {
                    must: termQueries,
                    filter: [
                      ...whatArr,
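                      // visibility filter: anon searches only match ACTIVE or NOSATS items;
                      // authed searches also match the searcher's own items regardless of status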
                      me
                        ? {
                            bool: {
                              should: [
                                { match: { status: 'ACTIVE' } },
                                { match: { status: 'NOSATS' } },
                                { match: { userId: me.id } }
                              ]
                            }
                          }
                        : {
                            bool: {
                              should: [
                                { match: { status: 'ACTIVE' } },
                                { match: { status: 'NOSATS' } }
                              ]
                            }
                          },
                      { range: { createdAt: whenRange } },
                      { range: { wvotes: { gte: 0 } } }
                    ]
                  }
                },
                functions,
                boost_mode: boostMode
              }
            },
            highlight: {
              fields: {
                title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
                text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] }
              }
            }
          }
        })
      } catch (e) {
        console.log(e)
        return {
          cursor: null,
          items: []
        }
      }

      const values = sitems.body.hits.hits.map((e, i) => {
        return `(${e._source.id}, ${i})`
      }).join(',')

      if (values.length === 0) {
        return {
          cursor: null,
          items: []
        }
      }

      const items = (await itemQueryWithMeta({
        me,
        models,
        query: `
          WITH r(id, rank) AS (VALUES ${values})
          ${SELECT}, rank
          FROM "Item"
          JOIN r ON "Item".id = r.id`,
        orderBy: 'ORDER BY rank ASC'
      })).map((item, i) => {
        const e = sitems.body.hits.hits[i]
        item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title
        item.searchText = (e.highlight?.text && e.highlight.text.join(' ... ')) || undefined
        return item
      })

      return {
        cursor: items.length === LIMIT ? nextCursorEncoded(decodedCursor) : null,
        items
      }
    }
  }
}
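
// Illustrative note (comments only, not executed): both resolvers map OpenSearch hits
// back to Postgres rows while preserving the search ranking. With hypothetical hit ids
// 821 and 17, `values` becomes "(821, 0),(17, 1)" and the query handed to
// itemQueryWithMeta is roughly:
//
//   WITH r(id, rank) AS (VALUES (821, 0),(17, 1))
//   <whatever ${SELECT} expands to>, rank
//   FROM "Item"
//   JOIN r ON "Item".id = r.id
//   ORDER BY rank ASC
//
// so rows come back in the same order OpenSearch scored them.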