stacker.news/api/resolvers/search.js

447 lines
12 KiB
JavaScript
Raw Normal View History

2022-10-20 22:44:44 +00:00
import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { whenToFrom } from '../../lib/time'
2024-01-17 23:39:39 +00:00
import { getItem, itemQueryWithMeta, SELECT } from './item'
2022-10-20 22:44:44 +00:00
/**
 * Break a raw search string into its structured pieces.
 *
 * - `quotes`: the contents of every double-quoted section, in order
 * - `url` / `nym`: the first whitespace-delimited token (outside of quotes)
 *   starting with `url:` / `nym:`, prefix included, or undefined if absent
 * - `query`: the remaining unquoted tokens joined with single spaces
 *
 * @param {string} q - raw search input
 * @returns {{ quotes: string[], nym: (string|undefined), url: (string|undefined), query: string }}
 */
function queryParts (q) {
  const quoted = /"([^"]*)"/gm
  const quotes = Array.from(q.matchAll(quoted), match => match[1])

  // tokens that live outside of any quoted section
  const words = q.replace(quoted, '').trim().split(/\s+/)

  const firstWithPrefix = prefix => words.find(w => w.startsWith(prefix))
  const url = firstWithPrefix('url:')
  const nym = firstWithPrefix('nym:')

  // drop every token equal to the chosen url:/nym: filter, keep the rest
  const remaining = []
  for (const w of words) {
    if (w !== url && w !== nym) {
      remaining.push(w)
    }
  }

  return {
    quotes,
    nym,
    url,
    query: remaining.join(' ')
  }
}
2022-10-20 22:44:44 +00:00
export default {
Query: {
2024-01-15 23:22:32 +00:00
related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
2022-10-26 22:46:01 +00:00
const decodedCursor = decodeCursor(cursor)
2023-11-11 23:56:20 +00:00
if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
return {
items: [],
cursor: null
2022-10-26 22:46:01 +00:00
}
}
2023-11-11 23:56:20 +00:00
const like = []
if (id) {
like.push({
2024-01-15 23:22:32 +00:00
_index: process.env.OPENSEARCH_INDEX,
2023-11-11 23:56:20 +00:00
_id: id
})
}
if (title) {
like.push(title)
}
2024-01-15 23:22:32 +00:00
const mustNot = [{ exists: { field: 'parentId' } }]
2022-10-26 22:46:01 +00:00
if (id) {
mustNot.push({ term: { id } })
}
2024-01-15 23:22:32 +00:00
let should = [
{
more_like_this: {
fields: ['title', 'text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
max_doc_freq: 5,
min_word_length: 2,
max_query_terms: 25,
2024-01-17 23:39:39 +00:00
minimum_should_match: minMatch || '10%',
boost_terms: 100
2024-01-15 23:22:32 +00:00
}
}
]
if (process.env.OPENSEARCH_MODEL_ID) {
let qtitle = title
let qtext = title
if (id) {
const item = await getItem(parent, { id }, { me, models })
qtitle = item.title || item.text
qtext = item.text || item.title
}
should = [
{
neural: {
title_embedding: {
query_text: qtext,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: qtitle,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
2024-01-17 23:39:39 +00:00
const results = await search.search({
2024-01-15 23:22:32 +00:00
index: process.env.OPENSEARCH_INDEX,
size: limit,
2022-10-26 22:46:01 +00:00
from: decodedCursor.offset,
2024-01-15 23:22:32 +00:00
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
2022-10-26 22:46:01 +00:00
body: {
query: {
2023-11-11 23:56:20 +00:00
function_score: {
query: {
bool: {
2024-01-15 23:22:32 +00:00
should,
filter: [
2023-11-11 23:56:20 +00:00
{
2024-01-15 23:22:32 +00:00
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
],
must_not: mustNot
2023-11-11 23:56:20 +00:00
}
},
{
2024-01-15 23:22:32 +00:00
range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
2023-11-11 23:56:20 +00:00
}
2024-01-15 23:22:32 +00:00
]
2022-10-26 22:46:01 +00:00
}
2023-11-11 23:56:20 +00:00
},
2024-01-15 23:22:32 +00:00
functions: [{
field_value_factor: {
field: 'wvotes',
modifier: 'none',
factor: 1,
missing: 0
}
}],
2023-11-11 23:56:20 +00:00
boost_mode: 'multiply'
2022-10-26 22:46:01 +00:00
}
2023-11-11 23:56:20 +00:00
}
2022-10-26 22:46:01 +00:00
}
})
2024-01-17 23:39:39 +00:00
const values = results.body.hits.hits.map((e, i) => {
return `(${e._source.id}, ${i})`
}).join(',')
const items = await itemQueryWithMeta({
me,
models,
query: `
WITH r(id, rank) AS (VALUES ${values})
${SELECT}, rank
FROM "Item"
JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC'
2022-10-26 22:46:01 +00:00
})
return {
cursor: items.length === (limit || LIMIT) ? nextCursorEncoded(decodedCursor) : null,
items
}
},
/**
 * Resolver: full-text (and optionally neural) search over items.
 *
 * The raw query `q` is split by `queryParts` into free terms, quoted phrases
 * and `url:` / `nym:` filters. Free terms become scoring clauses, everything
 * else becomes a filter. Scores are combined with a `function_score` whose
 * field depends on `sort`. Matching ids are loaded through
 * `itemQueryWithMeta` in search-rank order and decorated with
 * `searchTitle` / `searchText` built from the OpenSearch highlights.
 *
 * @param parent - unused
 * @param args.q - raw search string; an empty value yields an empty page
 * @param args.sub - restrict to this `sub.name`
 * @param args.cursor - encoded cursor; its decoded `offset` and `time` bound the page
 * @param args.sort - 'comments' | 'sats' | 'recent'; anything else scores by `wvotes`
 * @param args.what - 'posts' | 'comments'; anything else searches both
 * @param args.when - 'custom' uses `from`/`to`, otherwise passed to `whenToFrom`
 * @param args.from - lower createdAt bound for a custom range
 * @param args.to - upper createdAt bound for a custom range (capped at the cursor time)
 * @returns {Promise<{ cursor: (string|null), items: Array }>} an empty page is
 *   also returned when the search backend throws
 */
search: async (parent, { q, sub, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
  const decodedCursor = decodeCursor(cursor)
  let sitems

  if (!q) {
    return {
      items: [],
      cursor: null
    }
  }

  // filters (non-scoring): item kind, url/nym/sub, quoted phrases
  const whatArr = []
  switch (what) {
    case 'posts':
      whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
      break
    case 'comments':
      whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
      break
    default:
      break
  }

  const { query, quotes, nym, url } = queryParts(q)

  // slice(4) strips the `url:` / `nym:` prefix kept by queryParts
  if (url) {
    whatArr.push({ match_phrase_prefix: { url: `${url.slice(4).toLowerCase()}` } })
  }

  if (nym) {
    whatArr.push({ wildcard: { 'user.name': `*${nym.slice(4).toLowerCase()}*` } })
  }

  if (sub) {
    whatArr.push({ match: { 'sub.name': sub } })
  }

  let termQueries = [
    {
      // all terms are matched in fields
      multi_match: {
        query,
        type: 'best_fields',
        fields: ['title^100', 'text'],
        minimum_should_match: '100%',
        boost: 1000
      }
    }
  ]

  // every quoted phrase must appear verbatim
  for (const quote of quotes) {
    whatArr.push({
      multi_match: {
        query: quote,
        type: 'phrase',
        fields: ['title', 'text']
      }
    })
  }

  // if we search for an exact string only, everything must match
  // so score purely on sort field
  let boostMode = query ? 'multiply' : 'replace'
  let sortField
  let sortMod = 'log1p'
  switch (sort) {
    case 'comments':
      sortField = 'ncomments'
      sortMod = 'square'
      break
    case 'sats':
      sortField = 'sats'
      break
    case 'recent':
      sortField = 'createdAt'
      sortMod = 'square'
      boostMode = 'replace'
      break
    default:
      sortField = 'wvotes'
      sortMod = 'none'
      break
  }

  const functions = [
    {
      field_value_factor: {
        field: sortField,
        modifier: sortMod,
        factor: 1.2
      }
    }
  ]

  if (sort === 'recent') {
    // prioritize exact matches
    termQueries.push({
      multi_match: {
        query,
        type: 'phrase',
        fields: ['title^100', 'text'],
        boost: 1000
      }
    })
  } else {
    // allow fuzzy matching with partial matches
    termQueries.push({
      multi_match: {
        query,
        type: 'most_fields',
        fields: ['title^100', 'text'],
        fuzziness: 'AUTO',
        prefix_length: 3,
        minimum_should_match: '60%'
      }
    })
    functions.push({
      // small bias toward posts with comments
      field_value_factor: {
        field: 'ncomments',
        modifier: 'ln1p',
        factor: 1
      }
    },
    {
      // small bias toward recent posts
      field_value_factor: {
        field: 'createdAt',
        modifier: 'log1p',
        factor: 1
      }
    })
  }

  if (query.length) {
    // if we have a model id and we aren't sort by recent, use neural search
    if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
      const k = decodedCursor.offset + LIMIT
      termQueries = {
        hybrid: {
          queries: [
            {
              bool: {
                should: [
                  {
                    neural: {
                      title_embedding: {
                        query_text: query,
                        model_id: process.env.OPENSEARCH_MODEL_ID,
                        k
                      }
                    }
                  },
                  {
                    neural: {
                      text_embedding: {
                        query_text: query,
                        model_id: process.env.OPENSEARCH_MODEL_ID,
                        k
                      }
                    }
                  }
                ]
              }
            },
            {
              // the keyword clauses built above
              bool: {
                should: termQueries
              }
            }
          ]
        }
      }
    }
  } else {
    // only quotes/filters were given: nothing to score on
    termQueries = []
  }

  // never look past the cursor time so pages stay stable while paginating
  const whenRange = when === 'custom'
    ? {
        gte: whenFrom,
        lte: new Date(Math.min(new Date(whenTo), decodedCursor.time))
      }
    : {
        lte: decodedCursor.time,
        gte: whenToFrom(when)
      }

  // ACTIVE and NOSATS items match for everyone; a logged-in user
  // additionally matches their own items whatever the status
  const visibleTo = [
    { match: { status: 'ACTIVE' } },
    { match: { status: 'NOSATS' } }
  ]
  if (me) {
    visibleTo.push({ match: { userId: me.id } })
  }

  try {
    sitems = await search.search({
      index: process.env.OPENSEARCH_INDEX,
      size: LIMIT,
      _source: {
        excludes: [
          'text',
          'text_embedding',
          'title_embedding'
        ]
      },
      from: decodedCursor.offset,
      body: {
        query: {
          function_score: {
            query: {
              bool: {
                ...(sort === 'recent' ? { must: termQueries } : { should: termQueries }),
                filter: [
                  ...whatArr,
                  { bool: { should: visibleTo } },
                  { range: { createdAt: whenRange } },
                  { range: { wvotes: { gte: 0 } } }
                ]
              }
            },
            functions,
            boost_mode: boostMode
          }
        },
        highlight: {
          fields: {
            title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
            text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] }
          }
        }
      }
    })
  } catch (e) {
    // best effort: a failing search backend yields an empty page
    console.log(e)
    return {
      cursor: null,
      items: []
    }
  }

  const hits = sitems.body.hits.hits

  // (id, rank) pairs so the SQL result keeps the search ordering
  const values = hits.map((e, i) => {
    return `(${e._source.id}, ${i})`
  }).join(',')

  // an empty VALUES list is not valid SQL, so stop here when nothing matched
  if (values.length === 0) {
    return {
      cursor: null,
      items: []
    }
  }

  // look hits up by id rather than by position: the SQL query can return
  // fewer rows than there were hits, which would shift positions
  const hitsById = new Map(hits.map(e => [String(e._source.id), e]))

  const items = (await itemQueryWithMeta({
    me,
    models,
    query: `
      WITH r(id, rank) AS (VALUES ${values})
      ${SELECT}, rank
      FROM "Item"
      JOIN r ON "Item".id = r.id`,
    orderBy: 'ORDER BY rank ASC'
  })).map(item => {
    // assumes each returned row exposes `id` (the join column) — TODO confirm
    const highlight = hitsById.get(String(item.id))?.highlight
    item.searchTitle = highlight?.title?.[0] || item.title
    item.searchText = highlight?.text?.join(' ... ') || undefined
    return item
  })

  return {
    // a full page suggests there may be more to fetch
    cursor: items.length === LIMIT ? nextCursorEncoded(decodedCursor) : null,
    items
  }
}
}
}