semantic search
This commit is contained in:
parent
75bf4aced9
commit
9af3388353
|
@ -39,6 +39,8 @@ IMGPROXY_SALT=
|
|||
OPENSEARCH_URL=http://opensearch:9200
|
||||
OPENSEARCH_USERNAME=
|
||||
OPENSEARCH_PASSWORD=
|
||||
OPENSEARCH_INDEX=item
|
||||
OPENSEARCH_MODEL_ID=
|
||||
|
||||
#######################################################
|
||||
# WALLET / OPTIONAL #
|
||||
|
|
|
@ -2,23 +2,9 @@ import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
|
|||
import { whenToFrom } from '../../lib/time'
|
||||
import { getItem } from './item'
|
||||
|
||||
const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but',
|
||||
'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not',
|
||||
'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then',
|
||||
'there', 'these', 'they', 'this', 'to', 'was', 'will',
|
||||
'with', 'bitcoin', 'page', 'adds', 'how', 'why', 'what',
|
||||
'works', 'now', 'available', 'breaking', 'app', 'powered',
|
||||
'just', 'dev', 'using', 'crypto', 'has', 'my', 'i', 'apps',
|
||||
'really', 'new', 'era', 'application', 'best', 'year',
|
||||
'latest', 'still', 'few', 'crypto', 'keep', 'public', 'current',
|
||||
'levels', 'from', 'cryptocurrencies', 'confirmed', 'news', 'network',
|
||||
'about', 'sources', 'vote', 'considerations', 'hope',
|
||||
'keep', 'keeps', 'including', 'we', 'brings', "don't", 'do',
|
||||
'interesting', 'us', 'welcome', 'thoughts', 'results']
|
||||
|
||||
export default {
|
||||
Query: {
|
||||
related: async (parent, { title, id, cursor, limit, minMatch }, { me, models, search }) => {
|
||||
related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
|
||||
const decodedCursor = decodeCursor(cursor)
|
||||
|
||||
if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
|
||||
|
@ -31,7 +17,7 @@ export default {
|
|||
const like = []
|
||||
if (id) {
|
||||
like.push({
|
||||
_index: 'item',
|
||||
_index: process.env.OPENSEARCH_INDEX,
|
||||
_id: id
|
||||
})
|
||||
}
|
||||
|
@ -40,86 +26,98 @@ export default {
|
|||
like.push(title)
|
||||
}
|
||||
|
||||
const mustNot = []
|
||||
const mustNot = [{ exists: { field: 'parentId' } }]
|
||||
if (id) {
|
||||
mustNot.push({ term: { id } })
|
||||
}
|
||||
|
||||
let should = [
|
||||
{
|
||||
more_like_this: {
|
||||
fields: ['title', 'text'],
|
||||
like,
|
||||
min_term_freq: 1,
|
||||
min_doc_freq: 1,
|
||||
max_doc_freq: 5,
|
||||
min_word_length: 2,
|
||||
max_query_terms: 25,
|
||||
minimum_should_match: minMatch || '10%'
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
if (process.env.OPENSEARCH_MODEL_ID) {
|
||||
let qtitle = title
|
||||
let qtext = title
|
||||
if (id) {
|
||||
const item = await getItem(parent, { id }, { me, models })
|
||||
qtitle = item.title || item.text
|
||||
qtext = item.text || item.title
|
||||
}
|
||||
|
||||
should = [
|
||||
{
|
||||
neural: {
|
||||
title_embedding: {
|
||||
query_text: qtext,
|
||||
model_id: process.env.OPENSEARCH_MODEL_ID,
|
||||
k: decodedCursor.offset + LIMIT
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
neural: {
|
||||
text_embedding: {
|
||||
query_text: qtitle,
|
||||
model_id: process.env.OPENSEARCH_MODEL_ID,
|
||||
k: decodedCursor.offset + LIMIT
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
let items = await search.search({
|
||||
index: 'item',
|
||||
size: limit || LIMIT,
|
||||
index: process.env.OPENSEARCH_INDEX,
|
||||
size: limit,
|
||||
from: decodedCursor.offset,
|
||||
_source: {
|
||||
excludes: [
|
||||
'text',
|
||||
'text_embedding',
|
||||
'title_embedding'
|
||||
]
|
||||
},
|
||||
body: {
|
||||
query: {
|
||||
function_score: {
|
||||
query: {
|
||||
bool: {
|
||||
should: [
|
||||
should,
|
||||
filter: [
|
||||
{
|
||||
more_like_this: {
|
||||
fields: ['title'],
|
||||
like,
|
||||
min_term_freq: 1,
|
||||
min_doc_freq: 1,
|
||||
min_word_length: 2,
|
||||
max_query_terms: 12,
|
||||
minimum_should_match: minMatch || '80%',
|
||||
stop_words: STOP_WORDS,
|
||||
boost: 10000
|
||||
bool: {
|
||||
should: [
|
||||
{ match: { status: 'ACTIVE' } },
|
||||
{ match: { status: 'NOSATS' } }
|
||||
],
|
||||
must_not: mustNot
|
||||
}
|
||||
},
|
||||
{
|
||||
more_like_this: {
|
||||
fields: ['title'],
|
||||
like,
|
||||
min_term_freq: 1,
|
||||
min_doc_freq: 1,
|
||||
min_word_length: 2,
|
||||
max_query_terms: 12,
|
||||
minimum_should_match: minMatch || '60%',
|
||||
stop_words: STOP_WORDS,
|
||||
boost: 1000
|
||||
}
|
||||
},
|
||||
{
|
||||
more_like_this: {
|
||||
fields: ['title'],
|
||||
like,
|
||||
min_term_freq: 1,
|
||||
min_doc_freq: 1,
|
||||
min_word_length: 2,
|
||||
max_query_terms: 12,
|
||||
minimum_should_match: minMatch || '30%',
|
||||
stop_words: STOP_WORDS,
|
||||
boost: 100
|
||||
}
|
||||
},
|
||||
{
|
||||
more_like_this: {
|
||||
fields: ['text'],
|
||||
like,
|
||||
min_term_freq: 1,
|
||||
min_doc_freq: 1,
|
||||
min_word_length: 2,
|
||||
max_query_terms: 25,
|
||||
minimum_should_match: minMatch || '30%',
|
||||
stop_words: STOP_WORDS,
|
||||
boost: 10
|
||||
}
|
||||
range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
|
||||
}
|
||||
],
|
||||
must_not: [{ exists: { field: 'parentId' } }, ...mustNot],
|
||||
filter: {
|
||||
range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
field_value_factor: {
|
||||
field: 'wvotes',
|
||||
modifier: 'log1p',
|
||||
factor: 1.2,
|
||||
missing: 0
|
||||
},
|
||||
functions: [{
|
||||
field_value_factor: {
|
||||
field: 'wvotes',
|
||||
modifier: 'none',
|
||||
factor: 1,
|
||||
missing: 0
|
||||
}
|
||||
}],
|
||||
boost_mode: 'multiply'
|
||||
}
|
||||
}
|
||||
|
@ -177,24 +175,14 @@ export default {
|
|||
whatArr.push({ match: { 'sub.name': sub } })
|
||||
}
|
||||
|
||||
const should = [
|
||||
let termQueries = [
|
||||
{
|
||||
// all terms are matched in fields
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^1000', 'text'],
|
||||
type: 'best_fields',
|
||||
fields: ['title^100', 'text'],
|
||||
minimum_should_match: '100%',
|
||||
boost: 10000
|
||||
}
|
||||
},
|
||||
{
|
||||
// all terms are matched in fields fuzzily
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^1000', 'text'],
|
||||
minimum_should_match: '60%',
|
||||
boost: 1000
|
||||
}
|
||||
}
|
||||
|
@ -232,35 +220,87 @@ export default {
|
|||
}
|
||||
]
|
||||
|
||||
// allow fuzzy matching for single terms
|
||||
if (sort !== 'recent') {
|
||||
should.push({
|
||||
// only some terms must match unless we're sorting
|
||||
if (sort === 'recent') {
|
||||
// prioritize exact matches
|
||||
termQueries.push({
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'phrase',
|
||||
fields: ['title^100', 'text'],
|
||||
boost: 1000
|
||||
}
|
||||
})
|
||||
} else {
|
||||
// allow fuzzy matching with partial matches
|
||||
termQueries.push({
|
||||
multi_match: {
|
||||
query,
|
||||
type: 'most_fields',
|
||||
fields: ['title^1000', 'text'],
|
||||
fields: ['title^100', 'text'],
|
||||
fuzziness: 'AUTO',
|
||||
prefix_length: 3,
|
||||
minimum_should_match: '60%'
|
||||
}
|
||||
})
|
||||
// small bias toward posts with comments
|
||||
functions.push({
|
||||
// small bias toward posts with comments
|
||||
field_value_factor: {
|
||||
field: 'ncomments',
|
||||
modifier: 'ln1p',
|
||||
factor: 1
|
||||
}
|
||||
},
|
||||
{
|
||||
// small bias toward recent posts
|
||||
field_value_factor: {
|
||||
field: 'createdAt',
|
||||
modifier: 'log1p',
|
||||
factor: 1
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
if (query.length) {
|
||||
whatArr.push({
|
||||
bool: {
|
||||
should
|
||||
// if we have a model id and we aren't sort by recent, use neural search
|
||||
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
|
||||
termQueries = {
|
||||
hybrid: {
|
||||
queries: [
|
||||
{
|
||||
bool: {
|
||||
should: [
|
||||
{
|
||||
neural: {
|
||||
title_embedding: {
|
||||
query_text: query,
|
||||
model_id: process.env.OPENSEARCH_MODEL_ID,
|
||||
k: decodedCursor.offset + LIMIT
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
neural: {
|
||||
text_embedding: {
|
||||
query_text: query,
|
||||
model_id: process.env.OPENSEARCH_MODEL_ID,
|
||||
k: decodedCursor.offset + LIMIT
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
bool: {
|
||||
should: termQueries
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
} else {
|
||||
termQueries = []
|
||||
}
|
||||
|
||||
const whenRange = when === 'custom'
|
||||
|
@ -275,15 +315,23 @@ export default {
|
|||
|
||||
try {
|
||||
sitems = await search.search({
|
||||
index: 'item',
|
||||
index: process.env.OPENSEARCH_INDEX,
|
||||
size: LIMIT,
|
||||
_source: {
|
||||
excludes: [
|
||||
'text',
|
||||
'text_embedding',
|
||||
'title_embedding'
|
||||
]
|
||||
},
|
||||
from: decodedCursor.offset,
|
||||
body: {
|
||||
query: {
|
||||
function_score: {
|
||||
query: {
|
||||
bool: {
|
||||
must: [
|
||||
...(sort === 'recent' ? { must: termQueries } : { should: termQueries }),
|
||||
filter: [
|
||||
...whatArr,
|
||||
me
|
||||
? {
|
||||
|
@ -302,9 +350,7 @@ export default {
|
|||
{ match: { status: 'NOSATS' } }
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
filter: [
|
||||
},
|
||||
{
|
||||
range:
|
||||
{
|
||||
|
|
|
@ -1,14 +1,12 @@
|
|||
import os from '@opensearch-project/opensearch'
|
||||
|
||||
const options = process.env.NODE_ENV === 'development'
|
||||
? { node: process.env.OPENSEARCH_URL || 'http://localhost:9200' }
|
||||
: {
|
||||
node: process.env.OPENSEARCH_URL,
|
||||
auth: {
|
||||
username: process.env.OPENSEARCH_USERNAME,
|
||||
password: process.env.OPENSEARCH_PASSWORD
|
||||
}
|
||||
}
|
||||
const options = {
|
||||
node: process.env.OPENSEARCH_URL,
|
||||
auth: {
|
||||
username: process.env.OPENSEARCH_USERNAME,
|
||||
password: process.env.OPENSEARCH_PASSWORD
|
||||
}
|
||||
}
|
||||
|
||||
global.os = global.os || new os.Client(options)
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ export default function Related ({ title, itemId }) {
|
|||
const variables = { title, id: itemId, limit: LIMIT }
|
||||
return (
|
||||
<AccordianItem
|
||||
header={<div className='fw-bold'>related</div>}
|
||||
header={<div className='fw-bold'>related posts</div>}
|
||||
body={
|
||||
<Items
|
||||
query={RELATED_ITEMS}
|
||||
|
|
|
@ -0,0 +1,308 @@
|
|||
Getting semantic search setup in OpenSearch is a multistep process.
|
||||
|
||||
### step 1: configure the ml plugin
|
||||
```json
|
||||
PUT _cluster/settings
|
||||
{
|
||||
"persistent": {
|
||||
"plugins.ml_commons.only_run_on_ml_node": "false",
|
||||
"plugins.ml_commons.model_access_control_enabled": "true",
|
||||
"plugins.ml_commons.native_memory_threshold": "99"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### step 2: create a model group
|
||||
```json
|
||||
POST /_plugins/_ml/model_groups/_register
|
||||
{
|
||||
"name": "local_model_group",
|
||||
"description": "A model group for local models"
|
||||
}
|
||||
```
|
||||
|
||||
### step 3: register a pretrained model to the model group
|
||||
Importantly, we need to use a model that truncates input. Note the number of features (embedding dimensions) of the model you're using, because we'll need to store those features. For example, the model below has 768 features.
|
||||
|
||||
```json
|
||||
POST /_plugins/_ml/models/_register
|
||||
{
|
||||
"name": "huggingface/sentence-transformers/all-mpnet-base-v2",
|
||||
"version": "1.0.1",
|
||||
"model_group_id": <model group id>,
|
||||
"model_format": "TORCH_SCRIPT"
|
||||
}
|
||||
```
|
||||
|
||||
### step 4: wait until the model registration is complete
|
||||
```json
|
||||
GET /_plugins/_ml/tasks/<task id from above>
|
||||
```
|
||||
|
||||
### step 5: deploy the model
|
||||
Note the model id
|
||||
```json
|
||||
POST /_plugins/_ml/models/<model id>/_deploy
|
||||
```
|
||||
|
||||
### step 6: create an ingest pipeline
|
||||
Most models choke on empty strings, so we remove them at an earlier stage in the pipeline. We also add the model to the pipeline which generates the embeddings.
|
||||
|
||||
```json
|
||||
PUT /_ingest/pipeline/nlp-ingest-pipeline
|
||||
{
|
||||
"description": "An NLP ingest pipeline",
|
||||
"processors": [
|
||||
{
|
||||
"remove": {
|
||||
"field": "text",
|
||||
"if": "ctx?.text?.trim() == ''"
|
||||
}
|
||||
},
|
||||
{
|
||||
"remove": {
|
||||
"field": "title",
|
||||
"if": "ctx?.title?.trim() == ''"
|
||||
}
|
||||
},
|
||||
{
|
||||
"text_embedding": {
|
||||
"model_id": "6whlBY0B2sj1ObjeeD5d",
|
||||
"field_map": {
|
||||
"text": "text_embedding",
|
||||
"title": "title_embedding"
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### step 7: create a new index with the knn_vector type
|
||||
We'll need to create mappings for the embeddings, which is also a convenient time to specify special analyzers for the text and title fields.
|
||||
|
||||
```json
|
||||
PUT /item-nlp
|
||||
{
|
||||
"settings": {
|
||||
"index.knn": true,
|
||||
"default_pipeline": "nlp-ingest-pipeline"
|
||||
},
|
||||
"mappings": {
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "text",
|
||||
"analyzer": "english",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"title": {
|
||||
"type": "text",
|
||||
"analyzer": "english",
|
||||
"fields": {
|
||||
"keyword": {
|
||||
"type": "keyword",
|
||||
"ignore_above": 256
|
||||
}
|
||||
}
|
||||
},
|
||||
"title_embedding": {
|
||||
"type": "knn_vector",
|
||||
"dimension": 768,
|
||||
"method": {
|
||||
"engine": "lucene",
|
||||
"space_type": "l2",
|
||||
"name": "hnsw",
|
||||
"parameters": {}
|
||||
}
|
||||
},
|
||||
"text_embedding": {
|
||||
"type": "knn_vector",
|
||||
"dimension": 768,
|
||||
"method": {
|
||||
"engine": "lucene",
|
||||
"space_type": "l2",
|
||||
"name": "hnsw",
|
||||
"parameters": {}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### step 8: create a search pipeline for weighting term search and vector search
|
||||
```json
|
||||
PUT /_search/pipeline/nlp-search-pipeline
|
||||
{
|
||||
"description": "Pre and post processor for hybrid search",
|
||||
"request_processors": [
|
||||
{
|
||||
"neural_query_enricher" : {
|
||||
"description": "Sets the default model ID at index and field levels (which doesn't actually work)",
|
||||
"default_model_id": <model id>,
|
||||
}
|
||||
}
|
||||
],
|
||||
"phase_results_processors": [
|
||||
{
|
||||
"normalization-processor": {
|
||||
"normalization": {
|
||||
"technique": "min_max"
|
||||
},
|
||||
"combination": {
|
||||
"technique": "arithmetic_mean",
|
||||
"parameters": {
|
||||
"weights": [
|
||||
0.7,
|
||||
0.3
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### step 9: set it as the default search pipeline
|
||||
```json
|
||||
PUT /item-nlp/_settings
|
||||
{
|
||||
"index.search.default_pipeline" : "nlp-search-pipeline"
|
||||
}
|
||||
```
|
||||
|
||||
### step 10: reindex your data if you have data
|
||||
Warning: this takes a very long time.
|
||||
```json
|
||||
POST _reindex?wait_for_completion=false
|
||||
{
|
||||
"source": {
|
||||
"index": "item"
|
||||
},
|
||||
"dest": {
|
||||
"index": "item-nlp"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
You can check the status of the reindexing with the following command:
|
||||
```json
|
||||
GET _tasks/<task id>
|
||||
```
|
||||
|
||||
### step 11: search!
|
||||
```json
|
||||
GET /item-nlp/_search
|
||||
{
|
||||
"_source": {
|
||||
"excludes": [
|
||||
"text_embedding",
|
||||
"title_embedding"
|
||||
]
|
||||
},
|
||||
"size": 100,
|
||||
"function_score": {
|
||||
"query": {
|
||||
"hybrid": {
|
||||
"queries": [
|
||||
{
|
||||
"bool": {
|
||||
"should": [
|
||||
{
|
||||
"neural": {
|
||||
"title_embedding": {
|
||||
"query_text": "etf bitcoin",
|
||||
"model_id": <model id>,
|
||||
"k": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"neural": {
|
||||
"text_embedding": {
|
||||
"query_text": "etf bitcoin",
|
||||
"model_id": <model id>,
|
||||
"k": 100
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"filter": [
|
||||
{
|
||||
"range": {
|
||||
"wvotes": {
|
||||
"gte": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"bool": {
|
||||
"should": [
|
||||
{
|
||||
"multi_match": {
|
||||
"query": "etf bitcoin",
|
||||
"type": "most_fields",
|
||||
"fields": [
|
||||
"title^1000",
|
||||
"text"
|
||||
],
|
||||
"minimum_should_match": "100%",
|
||||
"boost": 10
|
||||
}
|
||||
},
|
||||
{
|
||||
"multi_match": {
|
||||
"query": "etf bitcoin",
|
||||
"type": "most_fields",
|
||||
"fields": [
|
||||
"title^1000",
|
||||
"text"
|
||||
],
|
||||
"minimum_should_match": "60%",
|
||||
"boost": 1
|
||||
}
|
||||
}
|
||||
],
|
||||
"filter": [
|
||||
{
|
||||
"range": {
|
||||
"wvotes": {
|
||||
"gte": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"functions": [
|
||||
{
|
||||
"field_value_factor": {
|
||||
"field": "wvotes",
|
||||
"modifier": "none",
|
||||
"factor": 1.2
|
||||
}
|
||||
},
|
||||
{
|
||||
"field_value_factor": {
|
||||
"field": "ncomments",
|
||||
"modifier": "ln1p",
|
||||
"factor": 1
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
|
@ -63,7 +63,7 @@ async function _indexItem (item, { models }) {
|
|||
try {
|
||||
await search.index({
|
||||
id: item.id,
|
||||
index: 'item',
|
||||
index: process.env.OPENSEARCH_INDEX,
|
||||
version: new Date(item.updatedAt).getTime(),
|
||||
versionType: 'external_gte',
|
||||
body: itemcp
|
||||
|
|
Loading…
Reference in New Issue