From 9af338835329585b9244a54098cdd666b2964eec Mon Sep 17 00:00:00 2001 From: keyan Date: Mon, 15 Jan 2024 17:22:32 -0600 Subject: [PATCH] semantic search --- .env.sample | 2 + api/resolvers/search.js | 258 +++++++++++++++++++-------------- api/search/index.js | 16 +-- components/related.js | 2 +- docs/semantic-search.md | 308 ++++++++++++++++++++++++++++++++++++++++ worker/search.js | 2 +- 6 files changed, 471 insertions(+), 117 deletions(-) create mode 100644 docs/semantic-search.md diff --git a/.env.sample b/.env.sample index ef53d7fe..79f31442 100644 --- a/.env.sample +++ b/.env.sample @@ -39,6 +39,8 @@ IMGPROXY_SALT= OPENSEARCH_URL=http://opensearch:9200 OPENSEARCH_USERNAME= OPENSEARCH_PASSWORD= +OPENSEARCH_INDEX=item +OPENSEARCH_MODEL_ID= ####################################################### # WALLET / OPTIONAL # diff --git a/api/resolvers/search.js b/api/resolvers/search.js index 10bb166f..ff785e5b 100644 --- a/api/resolvers/search.js +++ b/api/resolvers/search.js @@ -2,23 +2,9 @@ import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor' import { whenToFrom } from '../../lib/time' import { getItem } from './item' -const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', - 'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not', - 'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then', - 'there', 'these', 'they', 'this', 'to', 'was', 'will', - 'with', 'bitcoin', 'page', 'adds', 'how', 'why', 'what', - 'works', 'now', 'available', 'breaking', 'app', 'powered', - 'just', 'dev', 'using', 'crypto', 'has', 'my', 'i', 'apps', - 'really', 'new', 'era', 'application', 'best', 'year', - 'latest', 'still', 'few', 'crypto', 'keep', 'public', 'current', - 'levels', 'from', 'cryptocurrencies', 'confirmed', 'news', 'network', - 'about', 'sources', 'vote', 'considerations', 'hope', - 'keep', 'keeps', 'including', 'we', 'brings', "don't", 'do', - 'interesting', 'us', 'welcome', 'thoughts', 'results'] - export default { Query: { - related: 
async (parent, { title, id, cursor, limit, minMatch }, { me, models, search }) => { + related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => { const decodedCursor = decodeCursor(cursor) if (!id && (!title || title.trim().split(/\s+/).length < 1)) { @@ -31,7 +17,7 @@ export default { const like = [] if (id) { like.push({ - _index: 'item', + _index: process.env.OPENSEARCH_INDEX, _id: id }) } @@ -40,86 +26,98 @@ export default { like.push(title) } - const mustNot = [] + const mustNot = [{ exists: { field: 'parentId' } }] if (id) { mustNot.push({ term: { id } }) } + let should = [ + { + more_like_this: { + fields: ['title', 'text'], + like, + min_term_freq: 1, + min_doc_freq: 1, + max_doc_freq: 5, + min_word_length: 2, + max_query_terms: 25, + minimum_should_match: minMatch || '10%' + } + } + ] + + if (process.env.OPENSEARCH_MODEL_ID) { + let qtitle = title + let qtext = title + if (id) { + const item = await getItem(parent, { id }, { me, models }) + qtitle = item.title || item.text + qtext = item.text || item.title + } + + should = [ + { + neural: { + title_embedding: { + query_text: qtext, + model_id: process.env.OPENSEARCH_MODEL_ID, + k: decodedCursor.offset + LIMIT + } + } + }, + { + neural: { + text_embedding: { + query_text: qtitle, + model_id: process.env.OPENSEARCH_MODEL_ID, + k: decodedCursor.offset + LIMIT + } + } + } + ] + } + let items = await search.search({ - index: 'item', - size: limit || LIMIT, + index: process.env.OPENSEARCH_INDEX, + size: limit, from: decodedCursor.offset, + _source: { + excludes: [ + 'text', + 'text_embedding', + 'title_embedding' + ] + }, body: { query: { function_score: { query: { bool: { - should: [ + should, + filter: [ { - more_like_this: { - fields: ['title'], - like, - min_term_freq: 1, - min_doc_freq: 1, - min_word_length: 2, - max_query_terms: 12, - minimum_should_match: minMatch || '80%', - stop_words: STOP_WORDS, - boost: 10000 + bool: { + should: [ + { match: { status: 'ACTIVE' 
} }, + { match: { status: 'NOSATS' } } + ], + must_not: mustNot } }, { - more_like_this: { - fields: ['title'], - like, - min_term_freq: 1, - min_doc_freq: 1, - min_word_length: 2, - max_query_terms: 12, - minimum_should_match: minMatch || '60%', - stop_words: STOP_WORDS, - boost: 1000 - } - }, - { - more_like_this: { - fields: ['title'], - like, - min_term_freq: 1, - min_doc_freq: 1, - min_word_length: 2, - max_query_terms: 12, - minimum_should_match: minMatch || '30%', - stop_words: STOP_WORDS, - boost: 100 - } - }, - { - more_like_this: { - fields: ['text'], - like, - min_term_freq: 1, - min_doc_freq: 1, - min_word_length: 2, - max_query_terms: 25, - minimum_should_match: minMatch || '30%', - stop_words: STOP_WORDS, - boost: 10 - } + range: { wvotes: { gte: minMatch ? 0 : 0.2 } } } - ], - must_not: [{ exists: { field: 'parentId' } }, ...mustNot], - filter: { - range: { wvotes: { gte: minMatch ? 0 : 0.2 } } - } + ] } }, - field_value_factor: { - field: 'wvotes', - modifier: 'log1p', - factor: 1.2, - missing: 0 - }, + functions: [{ + field_value_factor: { + field: 'wvotes', + modifier: 'none', + factor: 1, + missing: 0 + } + }], boost_mode: 'multiply' } } @@ -177,24 +175,14 @@ export default { whatArr.push({ match: { 'sub.name': sub } }) } - const should = [ + let termQueries = [ { // all terms are matched in fields multi_match: { query, - type: 'most_fields', - fields: ['title^1000', 'text'], + type: 'best_fields', + fields: ['title^100', 'text'], minimum_should_match: '100%', - boost: 10000 - } - }, - { - // all terms are matched in fields fuzzily - multi_match: { - query, - type: 'most_fields', - fields: ['title^1000', 'text'], - minimum_should_match: '60%', boost: 1000 } } @@ -232,35 +220,87 @@ export default { } ] - // allow fuzzy matching for single terms - if (sort !== 'recent') { - should.push({ - // only some terms must match unless we're sorting + if (sort === 'recent') { + // prioritize exact matches + termQueries.push({ + multi_match: { + query, + 
type: 'phrase', + fields: ['title^100', 'text'], + boost: 1000 + } + }) + } else { + // allow fuzzy matching with partial matches + termQueries.push({ multi_match: { query, type: 'most_fields', - fields: ['title^1000', 'text'], + fields: ['title^100', 'text'], fuzziness: 'AUTO', prefix_length: 3, minimum_should_match: '60%' } }) - // small bias toward posts with comments functions.push({ + // small bias toward posts with comments field_value_factor: { field: 'ncomments', modifier: 'ln1p', factor: 1 } + }, + { + // small bias toward recent posts + field_value_factor: { + field: 'createdAt', + modifier: 'log1p', + factor: 1 + } }) } if (query.length) { - whatArr.push({ - bool: { - should + // if we have a model id and we aren't sort by recent, use neural search + if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') { + termQueries = { + hybrid: { + queries: [ + { + bool: { + should: [ + { + neural: { + title_embedding: { + query_text: query, + model_id: process.env.OPENSEARCH_MODEL_ID, + k: decodedCursor.offset + LIMIT + } + } + }, + { + neural: { + text_embedding: { + query_text: query, + model_id: process.env.OPENSEARCH_MODEL_ID, + k: decodedCursor.offset + LIMIT + } + } + } + ] + } + }, + { + bool: { + should: termQueries + } + } + ] + } } - }) + } + } else { + termQueries = [] } const whenRange = when === 'custom' @@ -275,15 +315,23 @@ export default { try { sitems = await search.search({ - index: 'item', + index: process.env.OPENSEARCH_INDEX, size: LIMIT, + _source: { + excludes: [ + 'text', + 'text_embedding', + 'title_embedding' + ] + }, from: decodedCursor.offset, body: { query: { function_score: { query: { bool: { - must: [ + ...(sort === 'recent' ? { must: termQueries } : { should: termQueries }), + filter: [ ...whatArr, me ? 
{ @@ -302,9 +350,7 @@ export default { { match: { status: 'NOSATS' } } ] } - } - ], - filter: [ + }, { range: { diff --git a/api/search/index.js b/api/search/index.js index 1fcc178f..ea2999f0 100644 --- a/api/search/index.js +++ b/api/search/index.js @@ -1,14 +1,12 @@ import os from '@opensearch-project/opensearch' -const options = process.env.NODE_ENV === 'development' - ? { node: process.env.OPENSEARCH_URL || 'http://localhost:9200' } - : { - node: process.env.OPENSEARCH_URL, - auth: { - username: process.env.OPENSEARCH_USERNAME, - password: process.env.OPENSEARCH_PASSWORD - } - } +const options = { + node: process.env.OPENSEARCH_URL, + auth: { + username: process.env.OPENSEARCH_USERNAME, + password: process.env.OPENSEARCH_PASSWORD + } +} global.os = global.os || new os.Client(options) diff --git a/components/related.js b/components/related.js index 2ccff80e..4408f59a 100644 --- a/components/related.js +++ b/components/related.js @@ -9,7 +9,7 @@ export default function Related ({ title, itemId }) { const variables = { title, id: itemId, limit: LIMIT } return ( related} + header={
related posts
} body={ , + "model_format": "TORCH_SCRIPT" +} +``` + +### step 4: wait until the model registration is complete +```json +GET /_plugins/_ml/tasks/ +``` + +### step 5: deploy the model +Note the model id +```json +POST /_plugins/_ml/models//_deploy +``` + +### step 6: create an ingest pipeline +Most models choke on empty strings, so we remove them at an earlier stage in the pipeline. We also add the model to the pipeline which generates the embeddings. + +```json +PUT /_ingest/pipeline/nlp-ingest-pipeline +{ + "description": "An NLP ingest pipeline", + "processors": [ + { + "remove": { + "field": "text", + "if": "ctx?.text?.trim() == ''" + } + }, + { + "remove": { + "field": "title", + "if": "ctx?.title?.trim() == ''" + } + }, + { + "text_embedding": { + "model_id": "6whlBY0B2sj1ObjeeD5d", + "field_map": { + "text": "text_embedding", + "title": "title_embedding" + } + } + } + ] +} +``` + +### step 7: create a new index with the knn_vector type +We'll need to create mappings for the embeddings, which is also a convenient time to specify special analyzers for the text and title fields. 
+ +```json +PUT /item-nlp +{ + "settings": { + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "title": { + "type": "text", + "analyzer": "english", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + }, + "title_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "engine": "lucene", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + }, + "text_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "engine": "lucene", + "space_type": "l2", + "name": "hnsw", + "parameters": {} + } + } + } + } +} +``` + +### step 8: create a search pipeline for weighting term search and vector search +```json +PUT /_search/pipeline/nlp-search-pipeline +{ + "description": "Pre and post processor for hybrid search", + "request_processors": [ + { + "neural_query_enricher" : { + "description": "Sets the default model ID at index and field levels (which doesn't actually work)", + "default_model_id": , + } + } + ], + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean", + "parameters": { + "weights": [ + 0.7, + 0.3 + ] + } + } + } + } + ] +} +``` + +### step 9: set it as the default search pipeline +```json +PUT /item-nlp/_settings +{ + "index.search.default_pipeline" : "nlp-search-pipeline" +} +``` + +### step 10: reindex your data if you have data +Warning: this takes a very, very long time. +```json +POST _reindex?wait_for_completion=false +{ + "source": { + "index": "item" + }, + "dest": { + "index": "item-nlp" + } +} +``` + +You can check the status of the reindexing with the following command: +```json +GET _tasks/ +``` + +### step 11: search! 
+```json +GET /item-nlp/_search +{ + "_source": { + "excludes": [ + "text_embedding", + "title_embedding" + ] + }, + "size": 100, + "function_score": { + "query": { + "hybrid": { + "queries": [ + { + "bool": { + "should": [ + { + "neural": { + "title_embedding": { + "query_text": "etf bitcoin", + "model_id": , + "k": 100 + } + } + }, + { + "neural": { + "text_embedding": { + "query_text": "etf bitcoin", + "model_id": , + "k": 100 + } + } + } + ], + "filter": [ + { + "range": { + "wvotes": { + "gte": 0 + } + } + } + ] + } + }, + { + "bool": { + "should": [ + { + "multi_match": { + "query": "etf bitcoin", + "type": "most_fields", + "fields": [ + "title^1000", + "text" + ], + "minimum_should_match": "100%", + "boost": 10 + } + }, + { + "multi_match": { + "query": "etf bitcoin", + "type": "most_fields", + "fields": [ + "title^1000", + "text" + ], + "minimum_should_match": "60%", + "boost": 1 + } + } + ], + "filter": [ + { + "range": { + "wvotes": { + "gte": 0 + } + } + } + ] + } + } + ] + } + }, + "functions": [ + { + "field_value_factor": { + "field": "wvotes", + "modifier": "none", + "factor": 1.2 + } + }, + { + "field_value_factor": { + "field": "ncomments", + "modifier": "ln1p", + "factor": 1 + } + } + ] + } +} +``` + diff --git a/worker/search.js b/worker/search.js index 55ed3993..fb0a4d5f 100644 --- a/worker/search.js +++ b/worker/search.js @@ -63,7 +63,7 @@ async function _indexItem (item, { models }) { try { await search.index({ id: item.id, - index: 'item', + index: process.env.OPENSEARCH_INDEX, version: new Date(item.updatedAt).getTime(), versionType: 'external_gte', body: itemcp