semantic search

This commit is contained in:
keyan 2024-01-15 17:22:32 -06:00
parent 75bf4aced9
commit 9af3388353
6 changed files with 471 additions and 117 deletions

View File

@ -39,6 +39,8 @@ IMGPROXY_SALT=
OPENSEARCH_URL=http://opensearch:9200 OPENSEARCH_URL=http://opensearch:9200
OPENSEARCH_USERNAME= OPENSEARCH_USERNAME=
OPENSEARCH_PASSWORD= OPENSEARCH_PASSWORD=
OPENSEARCH_INDEX=item
OPENSEARCH_MODEL_ID=
####################################################### #######################################################
# WALLET / OPTIONAL # # WALLET / OPTIONAL #

View File

@ -2,23 +2,9 @@ import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { whenToFrom } from '../../lib/time' import { whenToFrom } from '../../lib/time'
import { getItem } from './item' import { getItem } from './item'
const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but',
'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not',
'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then',
'there', 'these', 'they', 'this', 'to', 'was', 'will',
'with', 'bitcoin', 'page', 'adds', 'how', 'why', 'what',
'works', 'now', 'available', 'breaking', 'app', 'powered',
'just', 'dev', 'using', 'crypto', 'has', 'my', 'i', 'apps',
'really', 'new', 'era', 'application', 'best', 'year',
'latest', 'still', 'few', 'crypto', 'keep', 'public', 'current',
'levels', 'from', 'cryptocurrencies', 'confirmed', 'news', 'network',
'about', 'sources', 'vote', 'considerations', 'hope',
'keep', 'keeps', 'including', 'we', 'brings', "don't", 'do',
'interesting', 'us', 'welcome', 'thoughts', 'results']
export default { export default {
Query: { Query: {
related: async (parent, { title, id, cursor, limit, minMatch }, { me, models, search }) => { related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor) const decodedCursor = decodeCursor(cursor)
if (!id && (!title || title.trim().split(/\s+/).length < 1)) { if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
@ -31,7 +17,7 @@ export default {
const like = [] const like = []
if (id) { if (id) {
like.push({ like.push({
_index: 'item', _index: process.env.OPENSEARCH_INDEX,
_id: id _id: id
}) })
} }
@ -40,86 +26,98 @@ export default {
like.push(title) like.push(title)
} }
const mustNot = [] const mustNot = [{ exists: { field: 'parentId' } }]
if (id) { if (id) {
mustNot.push({ term: { id } }) mustNot.push({ term: { id } })
} }
let should = [
{
more_like_this: {
fields: ['title', 'text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
max_doc_freq: 5,
min_word_length: 2,
max_query_terms: 25,
minimum_should_match: minMatch || '10%'
}
}
]
if (process.env.OPENSEARCH_MODEL_ID) {
let qtitle = title
let qtext = title
if (id) {
const item = await getItem(parent, { id }, { me, models })
qtitle = item.title || item.text
qtext = item.text || item.title
}
should = [
{
neural: {
title_embedding: {
query_text: qtext,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: qtitle,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
let items = await search.search({ let items = await search.search({
index: 'item', index: process.env.OPENSEARCH_INDEX,
size: limit || LIMIT, size: limit,
from: decodedCursor.offset, from: decodedCursor.offset,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
body: { body: {
query: { query: {
function_score: { function_score: {
query: { query: {
bool: {
should,
filter: [
{
bool: { bool: {
should: [ should: [
{ { match: { status: 'ACTIVE' } },
more_like_this: { { match: { status: 'NOSATS' } }
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '80%',
stop_words: STOP_WORDS,
boost: 10000
}
},
{
more_like_this: {
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '60%',
stop_words: STOP_WORDS,
boost: 1000
}
},
{
more_like_this: {
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '30%',
stop_words: STOP_WORDS,
boost: 100
}
},
{
more_like_this: {
fields: ['text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 25,
minimum_should_match: minMatch || '30%',
stop_words: STOP_WORDS,
boost: 10
}
}
], ],
must_not: [{ exists: { field: 'parentId' } }, ...mustNot], must_not: mustNot
filter: { }
},
{
range: { wvotes: { gte: minMatch ? 0 : 0.2 } } range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
} }
]
} }
}, },
functions: [{
field_value_factor: { field_value_factor: {
field: 'wvotes', field: 'wvotes',
modifier: 'log1p', modifier: 'none',
factor: 1.2, factor: 1,
missing: 0 missing: 0
}, }
}],
boost_mode: 'multiply' boost_mode: 'multiply'
} }
} }
@ -177,24 +175,14 @@ export default {
whatArr.push({ match: { 'sub.name': sub } }) whatArr.push({ match: { 'sub.name': sub } })
} }
const should = [ let termQueries = [
{ {
// all terms are matched in fields // all terms are matched in fields
multi_match: { multi_match: {
query, query,
type: 'most_fields', type: 'best_fields',
fields: ['title^1000', 'text'], fields: ['title^100', 'text'],
minimum_should_match: '100%', minimum_should_match: '100%',
boost: 10000
}
},
{
// all terms are matched in fields fuzzily
multi_match: {
query,
type: 'most_fields',
fields: ['title^1000', 'text'],
minimum_should_match: '60%',
boost: 1000 boost: 1000
} }
} }
@ -232,35 +220,87 @@ export default {
} }
] ]
// allow fuzzy matching for single terms if (sort === 'recent') {
if (sort !== 'recent') { // prioritize exact matches
should.push({ termQueries.push({
// only some terms must match unless we're sorting multi_match: {
query,
type: 'phrase',
fields: ['title^100', 'text'],
boost: 1000
}
})
} else {
// allow fuzzy matching with partial matches
termQueries.push({
multi_match: { multi_match: {
query, query,
type: 'most_fields', type: 'most_fields',
fields: ['title^1000', 'text'], fields: ['title^100', 'text'],
fuzziness: 'AUTO', fuzziness: 'AUTO',
prefix_length: 3, prefix_length: 3,
minimum_should_match: '60%' minimum_should_match: '60%'
} }
}) })
// small bias toward posts with comments
functions.push({ functions.push({
// small bias toward posts with comments
field_value_factor: { field_value_factor: {
field: 'ncomments', field: 'ncomments',
modifier: 'ln1p', modifier: 'ln1p',
factor: 1 factor: 1
} }
},
{
// small bias toward recent posts
field_value_factor: {
field: 'createdAt',
modifier: 'log1p',
factor: 1
}
}) })
} }
if (query.length) { if (query.length) {
whatArr.push({ // if we have a model id and we aren't sort by recent, use neural search
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
termQueries = {
hybrid: {
queries: [
{
bool: { bool: {
should should: [
{
neural: {
title_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
} }
}) }
},
{
neural: {
text_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
},
{
bool: {
should: termQueries
}
}
]
}
}
}
} else {
termQueries = []
} }
const whenRange = when === 'custom' const whenRange = when === 'custom'
@ -275,15 +315,23 @@ export default {
try { try {
sitems = await search.search({ sitems = await search.search({
index: 'item', index: process.env.OPENSEARCH_INDEX,
size: LIMIT, size: LIMIT,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
from: decodedCursor.offset, from: decodedCursor.offset,
body: { body: {
query: { query: {
function_score: { function_score: {
query: { query: {
bool: { bool: {
must: [ ...(sort === 'recent' ? { must: termQueries } : { should: termQueries }),
filter: [
...whatArr, ...whatArr,
me me
? { ? {
@ -302,9 +350,7 @@ export default {
{ match: { status: 'NOSATS' } } { match: { status: 'NOSATS' } }
] ]
} }
} },
],
filter: [
{ {
range: range:
{ {

View File

@ -1,8 +1,6 @@
import os from '@opensearch-project/opensearch' import os from '@opensearch-project/opensearch'
const options = process.env.NODE_ENV === 'development' const options = {
? { node: process.env.OPENSEARCH_URL || 'http://localhost:9200' }
: {
node: process.env.OPENSEARCH_URL, node: process.env.OPENSEARCH_URL,
auth: { auth: {
username: process.env.OPENSEARCH_USERNAME, username: process.env.OPENSEARCH_USERNAME,

View File

@ -9,7 +9,7 @@ export default function Related ({ title, itemId }) {
const variables = { title, id: itemId, limit: LIMIT } const variables = { title, id: itemId, limit: LIMIT }
return ( return (
<AccordianItem <AccordianItem
header={<div className='fw-bold'>related</div>} header={<div className='fw-bold'>related posts</div>}
body={ body={
<Items <Items
query={RELATED_ITEMS} query={RELATED_ITEMS}

308
docs/semantic-search.md Normal file
View File

@ -0,0 +1,308 @@
Getting semantic search set up in OpenSearch is a multistep process.
### step 1: configure the ml plugin
```json
PUT _cluster/settings
{
"persistent": {
"plugins.ml_commons.only_run_on_ml_node": "false",
"plugins.ml_commons.model_access_control_enabled": "true",
"plugins.ml_commons.native_memory_threshold": "99"
}
}
```
### step 2: create a model group
```json
POST /_plugins/_ml/model_groups/_register
{
"name": "local_model_group",
"description": "A model group for local models"
}
```
### step 3: register a pretrained model to the model group
Importantly, we need to use a model that truncates input. Also note the number of features (the embedding dimension) of the model you're using, because we'll need that dimension when we define the index mappings. For example, the model below has 768 features.
```json
POST /_plugins/_ml/models/_register
{
"name": "huggingface/sentence-transformers/all-mpnet-base-v2",
"version": "1.0.1",
"model_group_id": <model group id>,
"model_format": "TORCH_SCRIPT"
}
```
### step 4: wait until the model registration is complete
```json
GET /_plugins/_ml/tasks/<task id from above>
```
### step 5: deploy the model
Note the model id
```json
POST /_plugins/_ml/models/<model id>/_deploy
```
### step 6: create an ingest pipeline
Most models choke on empty strings, so we remove them at an earlier stage in the pipeline. We also add the model to the pipeline which generates the embeddings.
```json
PUT /_ingest/pipeline/nlp-ingest-pipeline
{
"description": "An NLP ingest pipeline",
"processors": [
{
"remove": {
"field": "text",
"if": "ctx?.text?.trim() == ''"
}
},
{
"remove": {
"field": "title",
"if": "ctx?.title?.trim() == ''"
}
},
{
"text_embedding": {
"model_id": "6whlBY0B2sj1ObjeeD5d",
"field_map": {
"text": "text_embedding",
"title": "title_embedding"
}
}
}
]
}
```
### step 7: create a new index with the knn_vector type
We'll need to create mappings for the embeddings, which is also a convenient time to specify special analyzers for the text and title fields.
```json
PUT /item-nlp
{
"settings": {
"index.knn": true,
"default_pipeline": "nlp-ingest-pipeline"
},
"mappings": {
"properties": {
"text": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"text_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
}
}
}
}
```
### step 8: create a search pipeline for weighting term search and vector search
```json
PUT /_search/pipeline/nlp-search-pipeline
{
"description": "Pre and post processor for hybrid search",
"request_processors": [
{
"neural_query_enricher" : {
"description": "Sets the default model ID at index and field levels (which doesn't actually work)",
"default_model_id": <model id>,
}
}
],
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {
"technique": "min_max"
},
"combination": {
"technique": "arithmetic_mean",
"parameters": {
"weights": [
0.7,
0.3
]
}
}
}
}
]
}
```
### step 9: set it as the default search pipeline
```json
PUT /item-nlp/_settings
{
"index.search.default_pipeline" : "nlp-search-pipeline"
}
```
### step 10: reindex your data if you have data
Warning: this can take a very long time.
```json
POST _reindex?wait_for_completion=false
{
"source": {
"index": "item"
},
"dest": {
"index": "item-nlp"
}
}
```
You can check the status of the reindexing with the following command:
```json
GET _tasks/<task id>
```
### step 11: search!
```json
GET /item-nlp/_search
{
"_source": {
"excludes": [
"text_embedding",
"title_embedding"
]
},
"size": 100,
"function_score": {
"query": {
"hybrid": {
"queries": [
{
"bool": {
"should": [
{
"neural": {
"title_embedding": {
"query_text": "etf bitcoin",
"model_id": <model id>,
"k": 100
}
}
},
{
"neural": {
"text_embedding": {
"query_text": "etf bitcoin",
"model_id": <model id>,
"k": 100
}
}
}
],
"filter": [
{
"range": {
"wvotes": {
"gte": 0
}
}
}
]
}
},
{
"bool": {
"should": [
{
"multi_match": {
"query": "etf bitcoin",
"type": "most_fields",
"fields": [
"title^1000",
"text"
],
"minimum_should_match": "100%",
"boost": 10
}
},
{
"multi_match": {
"query": "etf bitcoin",
"type": "most_fields",
"fields": [
"title^1000",
"text"
],
"minimum_should_match": "60%",
"boost": 1
}
}
],
"filter": [
{
"range": {
"wvotes": {
"gte": 0
}
}
}
]
}
}
]
}
},
"functions": [
{
"field_value_factor": {
"field": "wvotes",
"modifier": "none",
"factor": 1.2
}
},
{
"field_value_factor": {
"field": "ncomments",
"modifier": "ln1p",
"factor": 1
}
}
]
}
}
```

View File

@ -63,7 +63,7 @@ async function _indexItem (item, { models }) {
try { try {
await search.index({ await search.index({
id: item.id, id: item.id,
index: 'item', index: process.env.OPENSEARCH_INDEX,
version: new Date(item.updatedAt).getTime(), version: new Date(item.updatedAt).getTime(),
versionType: 'external_gte', versionType: 'external_gte',
body: itemcp body: itemcp