// stacker.news/api/resolvers/search.js (476 lines, 13 KiB, JavaScript)

import { decodeCursor, LIMIT, nextCursorEncoded } from '@/lib/cursor'
import { whenToFrom } from '@/lib/time'
2024-01-17 23:39:39 +00:00
import { getItem, itemQueryWithMeta, SELECT } from './item'
2022-10-20 22:44:44 +00:00
/**
 * Split a raw search string into its structured parts.
 *
 * Double-quoted phrases are pulled out first. The remaining text is split on
 * whitespace, and the first word with each special prefix is picked out:
 * `url:` (url filter), `@` (nym) and `~` (territory). Every word equal to one
 * of those picks is removed from the free-text query.
 *
 * @param {string} q - raw search string
 * @returns {{ quotes: string[], nym: (string|undefined), url: (string|undefined), territory: (string|undefined), query: string }}
 *   `nym`, `url` and `territory` keep their prefix (`@`, `url:`, `~`);
 *   `query` is the remaining words joined by single spaces.
 */
function queryParts (q) {
  const regex = /"([^"]*)"/gm

  // swap each quoted phrase for a space (not '') so the words on either side
  // of it stay separate terms: `foo"bar"baz` -> `foo baz`, not `foobaz`
  const queryArr = q.replace(regex, ' ').trim().split(/\s+/)
  const url = queryArr.find(word => word.startsWith('url:'))
  const nym = queryArr.find(word => word.startsWith('@'))
  const territory = queryArr.find(word => word.startsWith('~'))
  const exclude = [url, nym, territory]
  const query = queryArr.filter(word => !exclude.includes(word)).join(' ')

  // drop empty phrases (`""`, `"  "`): they carry no terms to match on
  const quotes = [...q.matchAll(regex)]
    .map(m => m[1])
    .filter(quote => quote.trim().length > 0)

  return {
    quotes,
    nym,
    url,
    territory,
    query
  }
}
2022-10-20 22:44:44 +00:00
export default {
Query: {
2024-01-15 23:22:32 +00:00
related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
2022-10-26 22:46:01 +00:00
const decodedCursor = decodeCursor(cursor)
2023-11-11 23:56:20 +00:00
if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
return {
items: [],
cursor: null
2022-10-26 22:46:01 +00:00
}
}
2023-11-11 23:56:20 +00:00
const like = []
if (id) {
like.push({
2024-01-15 23:22:32 +00:00
_index: process.env.OPENSEARCH_INDEX,
2023-11-11 23:56:20 +00:00
_id: id
})
}
if (title) {
like.push(title)
}
2024-01-15 23:22:32 +00:00
const mustNot = [{ exists: { field: 'parentId' } }]
2022-10-26 22:46:01 +00:00
if (id) {
mustNot.push({ term: { id } })
}
2024-01-15 23:22:32 +00:00
let should = [
{
more_like_this: {
fields: ['title', 'text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
max_doc_freq: 5,
min_word_length: 2,
max_query_terms: 25,
2024-01-17 23:39:39 +00:00
minimum_should_match: minMatch || '10%',
boost_terms: 100
2024-01-15 23:22:32 +00:00
}
}
]
if (process.env.OPENSEARCH_MODEL_ID) {
let qtitle = title
let qtext = title
if (id) {
const item = await getItem(parent, { id }, { me, models })
qtitle = item.title || item.text
qtext = item.text || item.title
}
should = [
{
neural: {
title_embedding: {
query_text: qtext,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: qtitle,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
2024-01-17 23:39:39 +00:00
const results = await search.search({
2024-01-15 23:22:32 +00:00
index: process.env.OPENSEARCH_INDEX,
size: limit,
2022-10-26 22:46:01 +00:00
from: decodedCursor.offset,
2024-01-15 23:22:32 +00:00
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
2022-10-26 22:46:01 +00:00
body: {
query: {
2023-11-11 23:56:20 +00:00
function_score: {
query: {
bool: {
2024-01-15 23:22:32 +00:00
should,
filter: [
2023-11-11 23:56:20 +00:00
{
2024-01-15 23:22:32 +00:00
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
],
must_not: mustNot
2023-11-11 23:56:20 +00:00
}
},
{
2024-01-15 23:22:32 +00:00
range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
2023-11-11 23:56:20 +00:00
}
2024-01-15 23:22:32 +00:00
]
2022-10-26 22:46:01 +00:00
}
2023-11-11 23:56:20 +00:00
},
2024-01-15 23:22:32 +00:00
functions: [{
field_value_factor: {
field: 'wvotes',
modifier: 'none',
factor: 1,
missing: 0
}
}],
2023-11-11 23:56:20 +00:00
boost_mode: 'multiply'
2022-10-26 22:46:01 +00:00
}
2023-11-11 23:56:20 +00:00
}
2022-10-26 22:46:01 +00:00
}
})
2024-01-17 23:39:39 +00:00
const values = results.body.hits.hits.map((e, i) => {
return `(${e._source.id}, ${i})`
}).join(',')
2024-01-19 21:12:47 +00:00
if (values.length === 0) {
return {
cursor: null,
items: []
}
}
2024-01-17 23:39:39 +00:00
const items = await itemQueryWithMeta({
me,
models,
query: `
WITH r(id, rank) AS (VALUES ${values})
${SELECT}, rank
FROM "Item"
JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC'
2022-10-26 22:46:01 +00:00
})
return {
cursor: items.length === (limit || LIMIT) ? nextCursorEncoded(decodedCursor) : null,
items
}
},
/**
 * Full-text search over items.
 *
 * Parses `q` with `queryParts`, builds an OpenSearch `function_score` query
 * (term queries + filters + a sort-dependent scoring function), then loads
 * the matching rows from Postgres in the order OpenSearch ranked them and
 * attaches highlighted title/text snippets to each item.
 *
 * @param parent - GraphQL parent (unused)
 * @param args
 * @param {string} args.q - raw search string; supports "quoted phrases",
 *   `@nym`, `~territory` and `url:domain`
 * @param {string} [args.cursor] - encoded pagination cursor
 * @param {string} [args.sort] - 'comments', 'sats' or 'recent'; anything else
 *   scores by `wvotes`
 * @param {string} [args.what] - 'posts', 'comments' or 'bookmarks'; anything
 *   else applies no type filter
 * @param {string} [args.when] - 'custom' uses `from`/`to`; any other value is
 *   passed to `whenToFrom`
 * @param [args.from] - lower bound of the custom range
 * @param [args.to] - upper bound of the custom range, converted with `Number()`
 * @param context - `{ me, models, search }`
 * @returns {Promise<{ cursor: (string|null), items: Array }>} an empty page
 *   is returned when OpenSearch throws
 */
search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
  const decodedCursor = decodeCursor(cursor)
  let sitems = null
  let termQueries = []

  // short circuit: return empty result if either:
  // 1. no query provided, or
  // 2. searching bookmarks without being authed
  if (!q || (what === 'bookmarks' && !me)) {
    return {
      items: [],
      cursor: null
    }
  }

  // filters: these restrict the result set without contributing to the score
  const whatArr = []
  switch (what) {
    case 'posts':
      whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
      break
    case 'comments':
      whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
      break
    case 'bookmarks':
      if (me?.id) {
        whatArr.push({ match: { bookmarkedBy: me?.id } })
      }
      break
    default:
      break
  }

  const { query: _query, quotes, nym, url, territory } = queryParts(q)
  let query = _query

  const isUrlSearch = url && query.length === 0 // exclusively searching for an url

  if (url) {
    // strip the `url:` prefix (4 chars) or `url:www.` prefix (8 chars)
    const isFQDN = url.startsWith('url:www.')
    const domain = isFQDN ? url.slice(8) : url.slice(4)
    const fqdn = `www.${domain}`
    query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}`
  }

  if (nym) {
    whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
  }

  if (territory) {
    whatArr.push({ match: { 'sub.name': territory.slice(1) } })
  }

  termQueries.push({
    // all terms are matched in fields
    multi_match: {
      query,
      type: 'best_fields',
      fields: ['title^100', 'text'],
      minimum_should_match: (isUrlSearch) ? 1 : '100%',
      boost: 1000
    }
  })

  // every quoted phrase must appear verbatim
  for (const quote of quotes) {
    whatArr.push({
      multi_match: {
        query: quote,
        type: 'phrase',
        fields: ['title', 'text']
      }
    })
  }

  // if we search for an exact string only, everything must match
  // so score purely on sort field
  let boostMode = query ? 'multiply' : 'replace'
  let sortField
  let sortMod = 'log1p'
  switch (sort) {
    case 'comments':
      sortField = 'ncomments'
      sortMod = 'square'
      break
    case 'sats':
      sortField = 'sats'
      break
    case 'recent':
      sortField = 'createdAt'
      sortMod = 'square'
      boostMode = 'replace'
      break
    default:
      sortField = 'wvotes'
      sortMod = 'none'
      break
  }

  const functions = [
    {
      field_value_factor: {
        field: sortField,
        modifier: sortMod,
        factor: 1.2
      }
    }
  ]

  if (sort === 'recent' && !isUrlSearch) {
    // prioritize exact matches
    termQueries.push({
      multi_match: {
        query,
        type: 'phrase',
        fields: ['title^100', 'text'],
        boost: 1000
      }
    })
  } else {
    // allow fuzzy matching with partial matches
    termQueries.push({
      multi_match: {
        query,
        type: 'most_fields',
        fields: ['title^100', 'text'],
        fuzziness: 'AUTO',
        prefix_length: 3,
        minimum_should_match: (isUrlSearch) ? 1 : '60%'
      }
    })
    functions.push({
      // small bias toward posts with comments
      field_value_factor: {
        field: 'ncomments',
        modifier: 'ln1p',
        factor: 1
      }
    },
    {
      // small bias toward recent posts
      field_value_factor: {
        field: 'createdAt',
        modifier: 'log1p',
        factor: 1
      }
    })
  }

  if (query.length) {
    // if we have a model id and we aren't sort by recent, use neural search
    if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
      // the object literal is built before the assignment, so the inner
      // `should: termQueries` still refers to the term query array from above
      termQueries = {
        hybrid: {
          queries: [
            {
              bool: {
                should: [
                  {
                    neural: {
                      title_embedding: {
                        query_text: query,
                        model_id: process.env.OPENSEARCH_MODEL_ID,
                        k: decodedCursor.offset + LIMIT
                      }
                    }
                  },
                  {
                    neural: {
                      text_embedding: {
                        query_text: query,
                        model_id: process.env.OPENSEARCH_MODEL_ID,
                        k: decodedCursor.offset + LIMIT
                      }
                    }
                  }
                ]
              }
            },
            {
              bool: {
                should: termQueries
              }
            }
          ]
        }
      }
    }
  } else {
    // nothing but quotes/filters: rely on the filters alone
    termQueries = []
  }

  // never return items newer than the cursor time
  const whenRange = when === 'custom'
    ? {
        gte: whenFrom,
        lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
      }
    : {
        lte: decodedCursor.time,
        gte: whenToFrom(when)
      }

  try {
    sitems = await search.search({
      index: process.env.OPENSEARCH_INDEX,
      size: LIMIT,
      _source: {
        excludes: [
          'text',
          'text_embedding',
          'title_embedding'
        ]
      },
      from: decodedCursor.offset,
      body: {
        query: {
          function_score: {
            query: {
              bool: {
                must: termQueries,
                filter: [
                  ...whatArr,
                  // logged-in users also see their own items whatever the status
                  me
                    ? {
                        bool: {
                          should: [
                            { match: { status: 'ACTIVE' } },
                            { match: { status: 'NOSATS' } },
                            { match: { userId: me.id } }
                          ]
                        }
                      }
                    : {
                        bool: {
                          should: [
                            { match: { status: 'ACTIVE' } },
                            { match: { status: 'NOSATS' } }
                          ]
                        }
                      },
                  {
                    range:
                    {
                      createdAt: whenRange
                    }
                  },
                  { range: { wvotes: { gte: 0 } } }
                ]
              }
            },
            functions,
            boost_mode: boostMode
          }
        },
        highlight: {
          fields: {
            title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
            text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] }
          }
        }
      }
    })
  } catch (e) {
    // best effort: a failing search backend yields an empty page, not an error
    console.log(e)
    return {
      cursor: null,
      items: []
    }
  }

  const hits = sitems.body.hits.hits

  // (id, rank) pairs so the SQL query can return rows in OpenSearch's order
  const values = hits.map((e, i) => {
    return `(${e._source.id}, ${i})`
  }).join(',')

  if (values.length === 0) {
    return {
      cursor: null,
      items: []
    }
  }

  // index hits by id: the SQL below is an inner JOIN, so it can return fewer
  // rows than there are hits, and pairing by array position would then attach
  // highlights to the wrong item
  const hitsById = new Map(hits.map(hit => [String(hit._source.id), hit]))

  const items = (await itemQueryWithMeta({
    me,
    models,
    query: `
      WITH r(id, rank) AS (VALUES ${values})
      ${SELECT}, rank
      FROM "Item"
      JOIN r ON "Item".id = r.id`,
    orderBy: 'ORDER BY rank ASC'
  })).map((item, i) => {
    // fall back to the positional hit if the id lookup misses
    const e = hitsById.get(String(item.id)) ?? hits[i]
    item.searchTitle = e?.highlight?.title?.[0] || item.title
    item.searchText = e?.highlight?.text?.join(' ... ') || undefined
    return item
  })

  return {
    // a full page means there may be more results to fetch
    cursor: items.length === LIMIT ? nextCursorEncoded(decodedCursor) : null,
    items
  }
}
}
}