SatsAllDay d7ecbbae3a
Search bookmarks (#1075)
* Support an `is:bookmarked` search option for searching your bookmarked items

* Update the worker search module to include `bookmarkedBy: Array<Number>`, which
contains the ids of the users who have bookmarked a given item (sketched below)
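
  For illustration, a minimal sketch of how the worker might attach this field when
  building the search document (the Prisma-style `models.bookmark.findMany` call and
  the `itemToDoc` name are assumptions, not code from this commit):

  ```js
  // hypothetical: collect the ids of users who have bookmarked the item
  async function itemToDoc (models, item) {
    const bookmarks = await models.bookmark.findMany({
      where: { itemId: item.id },
      select: { userId: true }
    })
    // bookmarkedBy: Array<Number>, empty for items nobody has bookmarked yet
    return { ...item, bookmarkedBy: bookmarks.map(b => b.userId) }
  }
  ```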

* Add a trigger on the `Bookmark` DB table to re-index the corresponding item when
a bookmark is added/removed

* Update the Search resolver to check for an `is:bookmarked` query option. If provided,
include it in the search request so that search only returns items bookmarked by the
current user. By default, this preserves stacker privacy: you can only search your own
bookmarks

* Update the search page UI to show how to search your own bookmarks

* undo `is:bookmarked` support; add a `bookmarks` item to the search select instead

* short-circuit and return an empty payload when an anon requests bookmarks

* remove console.log for debugging

* fix indexing a new item that has yet to be bookmarked

* update the db migration to re-index all existing bookmarked items one time

* fix the case where deleting a bookmark doesn't trigger a re-index of the item

explicitly specify an `updatedAt` value when deleting a bookmark, to ensure that
deleting a bookmark results in a newly indexed version of the bookmarked item
(see the sketch below)
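
  A hedged sketch of the idea (the transaction shape and the `userId_itemId`
  compound key are assumptions, not verbatim from this commit):

  ```js
  // hypothetical: touch the item's updatedAt alongside the delete so the indexer
  // sees a version newer than the one it last indexed
  await models.$transaction([
    models.bookmark.delete({ where: { userId_itemId: { userId, itemId } } }),
    models.item.update({ where: { id: itemId }, data: { updatedAt: new Date() } })
  ])
  ```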

* update the search indexer to use the latest of all three candidate versions
(sketched below)
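
  Conceptually, that amounts to a max() over the candidate timestamps; a hedged
  sketch (the variable names are illustrative, not taken from the worker):

  ```js
  // hypothetical: the newest of the candidate timestamps wins as the doc's version
  const version = new Date(Math.max(
    new Date(item.updatedAt).getTime(),
    bookmarkCreatedAt ? new Date(bookmarkCreatedAt).getTime() : 0,
    bookmarkUpdatedAt ? new Date(bookmarkUpdatedAt).getTime() : 0
  ))
  ```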

* give bookmark index jobs a longer expiration

---------

Co-authored-by: Keyan <34140557+huumn@users.noreply.github.com>
Co-authored-by: keyan <keyan.kousha+huumn@gmail.com>
2024-04-19 13:24:48 -05:00

import { decodeCursor, LIMIT, nextCursorEncoded } from '@/lib/cursor'
import { whenToFrom } from '@/lib/time'
import { getItem, itemQueryWithMeta, SELECT } from './item'
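
// split the raw query into structured parts: quoted phrases, a url: term,
// an @nym term, a ~territory term, and the remaining free-text query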
function queryParts (q) {
const regex = /"([^"]*)"/gm
const queryArr = q.replace(regex, '').trim().split(/\s+/)
const url = queryArr.find(word => word.startsWith('url:'))
const nym = queryArr.find(word => word.startsWith('@'))
const territory = queryArr.find(word => word.startsWith('~'))
const exclude = [url, nym, territory]
const query = queryArr.filter(word => !exclude.includes(word)).join(' ')
return {
quotes: [...q.matchAll(regex)].map(m => m[1]),
nym,
url,
territory,
query
}
}

export default {
Query: {
related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor)
if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
return {
items: [],
cursor: null
}
}
const like = []
if (id) {
like.push({
_index: process.env.OPENSEARCH_INDEX,
_id: id
})
}
if (title) {
like.push(title)
}
const mustNot = [{ exists: { field: 'parentId' } }]
if (id) {
mustNot.push({ term: { id } })
}
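// by default, rank by lexical similarity over title and text via more_like_this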
let should = [
{
more_like_this: {
fields: ['title', 'text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
max_doc_freq: 5,
min_word_length: 2,
max_query_terms: 25,
minimum_should_match: minMatch || '10%',
boost_terms: 100
}
}
]
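// if a neural model is configured, swap the lexical query for kNN neural
// queries over the title and text embeddings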
if (process.env.OPENSEARCH_MODEL_ID) {
let qtitle = title
let qtext = title
if (id) {
const item = await getItem(parent, { id }, { me, models })
qtitle = item.title || item.text
qtext = item.text || item.title
}
should = [
{
neural: {
title_embedding: {
query_text: qtext,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: qtitle,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
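// final score = similarity * wvotes (boost_mode below; missing wvotes counts as 0)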
const results = await search.search({
index: process.env.OPENSEARCH_INDEX,
size: limit,
from: decodedCursor.offset,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
body: {
query: {
function_score: {
query: {
bool: {
should,
filter: [
{
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
],
must_not: mustNot
}
},
{
range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
}
]
}
},
functions: [{
field_value_factor: {
field: 'wvotes',
modifier: 'none',
factor: 1,
missing: 0
}
}],
boost_mode: 'multiply'
}
}
}
})
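// map hits to (id, rank) rows so the db join below preserves search order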
const values = results.body.hits.hits.map((e, i) => {
return `(${e._source.id}, ${i})`
}).join(',')
if (values.length === 0) {
return {
cursor: null,
items: []
}
}
const items = await itemQueryWithMeta({
me,
models,
query: `
WITH r(id, rank) AS (VALUES ${values})
${SELECT}, rank
FROM "Item"
JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC'
})
return {
cursor: items.length === (limit || LIMIT) ? nextCursorEncoded(decodedCursor) : null,
items
}
},
search: async (parent, { q, cursor, sort, what, when, from: whenFrom, to: whenTo }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor)
let sitems = null
let termQueries = []
// short circuit: return empty result if either:
// 1. no query provided, or
// 2. searching bookmarks without being authed
if (!q || (what === 'bookmarks' && !me)) {
return {
items: [],
cursor: null
}
}
const whatArr = []
switch (what) {
case 'posts':
whatArr.push({ bool: { must_not: { exists: { field: 'parentId' } } } })
break
case 'comments':
whatArr.push({ bool: { must: { exists: { field: 'parentId' } } } })
break
case 'bookmarks':
if (me?.id) {
whatArr.push({ match: { bookmarkedBy: me?.id } })
}
break
default:
break
}
const { query: _query, quotes, nym, url, territory } = queryParts(q)
let query = _query
const isUrlSearch = url && query.length === 0 // exclusively searching for a url
if (url) {
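// expand the url term so both the bare domain and its www. form match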
const isFQDN = url.startsWith('url:www.')
const domain = isFQDN ? url.slice(8) : url.slice(4)
const fqdn = `www.${domain}`
query = (isUrlSearch) ? `${domain} ${fqdn}` : `${query.trim()} ${domain}`
}
if (nym) {
whatArr.push({ wildcard: { 'user.name': `*${nym.slice(1).toLowerCase()}*` } })
}
if (territory) {
whatArr.push({ match: { 'sub.name': territory.slice(1) } })
}
termQueries.push({
// all terms are matched in fields
multi_match: {
query,
type: 'best_fields',
fields: ['title^100', 'text'],
minimum_should_match: (isUrlSearch) ? 1 : '100%',
boost: 1000
}
})
for (const quote of quotes) {
whatArr.push({
multi_match: {
query: quote,
type: 'phrase',
fields: ['title', 'text']
}
})
}
// if we search for an exact string only, everything must match
// so score purely on sort field
let boostMode = query ? 'multiply' : 'replace'
let sortField
let sortMod = 'log1p'
switch (sort) {
case 'comments':
sortField = 'ncomments'
sortMod = 'square'
break
case 'sats':
sortField = 'sats'
break
case 'recent':
sortField = 'createdAt'
sortMod = 'square'
boostMode = 'replace'
break
default:
sortField = 'wvotes'
sortMod = 'none'
break
}
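// base ranking function derived from the chosen sort field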
const functions = [
{
field_value_factor: {
field: sortField,
modifier: sortMod,
factor: 1.2
}
}
]
if (sort === 'recent' && !isUrlSearch) {
// prioritize exact matches
termQueries.push({
multi_match: {
query,
type: 'phrase',
fields: ['title^100', 'text'],
boost: 1000
}
})
} else {
// allow fuzzy matching with partial matches
termQueries.push({
multi_match: {
query,
type: 'most_fields',
fields: ['title^100', 'text'],
fuzziness: 'AUTO',
prefix_length: 3,
minimum_should_match: (isUrlSearch) ? 1 : '60%'
}
})
functions.push({
// small bias toward posts with comments
field_value_factor: {
field: 'ncomments',
modifier: 'ln1p',
factor: 1
}
},
{
// small bias toward recent posts
field_value_factor: {
field: 'createdAt',
modifier: 'log1p',
factor: 1
}
})
}
if (query.length) {
// if we have a model id and we aren't sorting by recent, use neural search
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
termQueries = {
hybrid: {
queries: [
{
bool: {
should: [
{
neural: {
title_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
},
{
bool: {
should: termQueries
}
}
]
}
}
}
} else {
termQueries = []
}
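// bound createdAt by the requested window, capped at the cursor's snapshot time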
const whenRange = when === 'custom'
? {
gte: whenFrom,
lte: new Date(Math.min(new Date(Number(whenTo)), decodedCursor.time))
}
: {
lte: decodedCursor.time,
gte: whenToFrom(when)
}
try {
sitems = await search.search({
index: process.env.OPENSEARCH_INDEX,
size: LIMIT,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
from: decodedCursor.offset,
body: {
query: {
function_score: {
query: {
bool: {
must: termQueries,
filter: [
...whatArr,
me
? {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } },
{ match: { userId: me.id } }
]
}
}
: {
bool: {
should: [
{ match: { status: 'ACTIVE' } },
{ match: { status: 'NOSATS' } }
]
}
},
{
range:
{
createdAt: whenRange
}
},
{ range: { wvotes: { gte: 0 } } }
]
}
},
functions,
boost_mode: boostMode
}
},
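// highlight matching terms with *** markers for the client to render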
highlight: {
fields: {
title: { number_of_fragments: 0, pre_tags: ['***'], post_tags: ['***'] },
text: { number_of_fragments: 5, order: 'score', pre_tags: ['***'], post_tags: ['***'] }
}
}
}
})
} catch (e) {
console.log(e)
return {
cursor: null,
items: []
}
}
const values = sitems.body.hits.hits.map((e, i) => {
return `(${e._source.id}, ${i})`
}).join(',')
if (values.length === 0) {
return {
cursor: null,
items: []
}
}
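// fetch the full items in rank order, then decorate them with search highlights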
const items = (await itemQueryWithMeta({
me,
models,
query: `
WITH r(id, rank) AS (VALUES ${values})
${SELECT}, rank
FROM "Item"
JOIN r ON "Item".id = r.id`,
orderBy: 'ORDER BY rank ASC'
})).map((item, i) => {
const e = sitems.body.hits.hits[i]
item.searchTitle = (e.highlight?.title && e.highlight.title[0]) || item.title
item.searchText = (e.highlight?.text && e.highlight.text.join(' ... ')) || undefined
return item
})
return {
cursor: items.length === LIMIT ? nextCursorEncoded(decodedCursor) : null,
items
}
}
}
}