semantic search

This commit is contained in:
keyan 2024-01-15 17:22:32 -06:00
parent 75bf4aced9
commit 9af3388353
6 changed files with 471 additions and 117 deletions

View File

@ -39,6 +39,8 @@ IMGPROXY_SALT=
OPENSEARCH_URL=http://opensearch:9200 OPENSEARCH_URL=http://opensearch:9200
OPENSEARCH_USERNAME= OPENSEARCH_USERNAME=
OPENSEARCH_PASSWORD= OPENSEARCH_PASSWORD=
OPENSEARCH_INDEX=item
OPENSEARCH_MODEL_ID=
####################################################### #######################################################
# WALLET / OPTIONAL # # WALLET / OPTIONAL #

View File

@ -2,23 +2,9 @@ import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
import { whenToFrom } from '../../lib/time' import { whenToFrom } from '../../lib/time'
import { getItem } from './item' import { getItem } from './item'
const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but',
'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not',
'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then',
'there', 'these', 'they', 'this', 'to', 'was', 'will',
'with', 'bitcoin', 'page', 'adds', 'how', 'why', 'what',
'works', 'now', 'available', 'breaking', 'app', 'powered',
'just', 'dev', 'using', 'crypto', 'has', 'my', 'i', 'apps',
'really', 'new', 'era', 'application', 'best', 'year',
'latest', 'still', 'few', 'crypto', 'keep', 'public', 'current',
'levels', 'from', 'cryptocurrencies', 'confirmed', 'news', 'network',
'about', 'sources', 'vote', 'considerations', 'hope',
'keep', 'keeps', 'including', 'we', 'brings', "don't", 'do',
'interesting', 'us', 'welcome', 'thoughts', 'results']
export default { export default {
Query: { Query: {
related: async (parent, { title, id, cursor, limit, minMatch }, { me, models, search }) => { related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
const decodedCursor = decodeCursor(cursor) const decodedCursor = decodeCursor(cursor)
if (!id && (!title || title.trim().split(/\s+/).length < 1)) { if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
@ -31,7 +17,7 @@ export default {
const like = [] const like = []
if (id) { if (id) {
like.push({ like.push({
_index: 'item', _index: process.env.OPENSEARCH_INDEX,
_id: id _id: id
}) })
} }
@ -40,86 +26,98 @@ export default {
like.push(title) like.push(title)
} }
const mustNot = [] const mustNot = [{ exists: { field: 'parentId' } }]
if (id) { if (id) {
mustNot.push({ term: { id } }) mustNot.push({ term: { id } })
} }
let should = [
{
more_like_this: {
fields: ['title', 'text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
max_doc_freq: 5,
min_word_length: 2,
max_query_terms: 25,
minimum_should_match: minMatch || '10%'
}
}
]
if (process.env.OPENSEARCH_MODEL_ID) {
let qtitle = title
let qtext = title
if (id) {
const item = await getItem(parent, { id }, { me, models })
qtitle = item.title || item.text
qtext = item.text || item.title
}
should = [
{
neural: {
title_embedding: {
query_text: qtext,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
},
{
neural: {
text_embedding: {
query_text: qtitle,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
let items = await search.search({ let items = await search.search({
index: 'item', index: process.env.OPENSEARCH_INDEX,
size: limit || LIMIT, size: limit,
from: decodedCursor.offset, from: decodedCursor.offset,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
body: { body: {
query: { query: {
function_score: { function_score: {
query: { query: {
bool: {
should,
filter: [
{
bool: { bool: {
should: [ should: [
{ { match: { status: 'ACTIVE' } },
more_like_this: { { match: { status: 'NOSATS' } }
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '80%',
stop_words: STOP_WORDS,
boost: 10000
}
},
{
more_like_this: {
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '60%',
stop_words: STOP_WORDS,
boost: 1000
}
},
{
more_like_this: {
fields: ['title'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 12,
minimum_should_match: minMatch || '30%',
stop_words: STOP_WORDS,
boost: 100
}
},
{
more_like_this: {
fields: ['text'],
like,
min_term_freq: 1,
min_doc_freq: 1,
min_word_length: 2,
max_query_terms: 25,
minimum_should_match: minMatch || '30%',
stop_words: STOP_WORDS,
boost: 10
}
}
], ],
must_not: [{ exists: { field: 'parentId' } }, ...mustNot], must_not: mustNot
filter: { }
},
{
range: { wvotes: { gte: minMatch ? 0 : 0.2 } } range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
} }
]
} }
}, },
functions: [{
field_value_factor: { field_value_factor: {
field: 'wvotes', field: 'wvotes',
modifier: 'log1p', modifier: 'none',
factor: 1.2, factor: 1,
missing: 0 missing: 0
}, }
}],
boost_mode: 'multiply' boost_mode: 'multiply'
} }
} }
@ -177,24 +175,14 @@ export default {
whatArr.push({ match: { 'sub.name': sub } }) whatArr.push({ match: { 'sub.name': sub } })
} }
const should = [ let termQueries = [
{ {
// all terms are matched in fields // all terms are matched in fields
multi_match: { multi_match: {
query, query,
type: 'most_fields', type: 'best_fields',
fields: ['title^1000', 'text'], fields: ['title^100', 'text'],
minimum_should_match: '100%', minimum_should_match: '100%',
boost: 10000
}
},
{
// all terms are matched in fields fuzzily
multi_match: {
query,
type: 'most_fields',
fields: ['title^1000', 'text'],
minimum_should_match: '60%',
boost: 1000 boost: 1000
} }
} }
@ -232,35 +220,87 @@ export default {
} }
] ]
// allow fuzzy matching for single terms if (sort === 'recent') {
if (sort !== 'recent') { // prioritize exact matches
should.push({ termQueries.push({
// only some terms must match unless we're sorting multi_match: {
query,
type: 'phrase',
fields: ['title^100', 'text'],
boost: 1000
}
})
} else {
// allow fuzzy matching with partial matches
termQueries.push({
multi_match: { multi_match: {
query, query,
type: 'most_fields', type: 'most_fields',
fields: ['title^1000', 'text'], fields: ['title^100', 'text'],
fuzziness: 'AUTO', fuzziness: 'AUTO',
prefix_length: 3, prefix_length: 3,
minimum_should_match: '60%' minimum_should_match: '60%'
} }
}) })
// small bias toward posts with comments
functions.push({ functions.push({
// small bias toward posts with comments
field_value_factor: { field_value_factor: {
field: 'ncomments', field: 'ncomments',
modifier: 'ln1p', modifier: 'ln1p',
factor: 1 factor: 1
} }
},
{
// small bias toward recent posts
field_value_factor: {
field: 'createdAt',
modifier: 'log1p',
factor: 1
}
}) })
} }
if (query.length) { if (query.length) {
whatArr.push({ // if we have a model id and we aren't sort by recent, use neural search
if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
termQueries = {
hybrid: {
queries: [
{
bool: { bool: {
should should: [
{
neural: {
title_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
} }
}) }
},
{
neural: {
text_embedding: {
query_text: query,
model_id: process.env.OPENSEARCH_MODEL_ID,
k: decodedCursor.offset + LIMIT
}
}
}
]
}
},
{
bool: {
should: termQueries
}
}
]
}
}
}
} else {
termQueries = []
} }
const whenRange = when === 'custom' const whenRange = when === 'custom'
@ -275,15 +315,23 @@ export default {
try { try {
sitems = await search.search({ sitems = await search.search({
index: 'item', index: process.env.OPENSEARCH_INDEX,
size: LIMIT, size: LIMIT,
_source: {
excludes: [
'text',
'text_embedding',
'title_embedding'
]
},
from: decodedCursor.offset, from: decodedCursor.offset,
body: { body: {
query: { query: {
function_score: { function_score: {
query: { query: {
bool: { bool: {
must: [ ...(sort === 'recent' ? { must: termQueries } : { should: termQueries }),
filter: [
...whatArr, ...whatArr,
me me
? { ? {
@ -302,9 +350,7 @@ export default {
{ match: { status: 'NOSATS' } } { match: { status: 'NOSATS' } }
] ]
} }
} },
],
filter: [
{ {
range: range:
{ {

View File

@ -1,8 +1,6 @@
import os from '@opensearch-project/opensearch' import os from '@opensearch-project/opensearch'
const options = process.env.NODE_ENV === 'development' const options = {
? { node: process.env.OPENSEARCH_URL || 'http://localhost:9200' }
: {
node: process.env.OPENSEARCH_URL, node: process.env.OPENSEARCH_URL,
auth: { auth: {
username: process.env.OPENSEARCH_USERNAME, username: process.env.OPENSEARCH_USERNAME,

View File

@ -9,7 +9,7 @@ export default function Related ({ title, itemId }) {
const variables = { title, id: itemId, limit: LIMIT } const variables = { title, id: itemId, limit: LIMIT }
return ( return (
<AccordianItem <AccordianItem
header={<div className='fw-bold'>related</div>} header={<div className='fw-bold'>related posts</div>}
body={ body={
<Items <Items
query={RELATED_ITEMS} query={RELATED_ITEMS}

308
docs/semantic-search.md Normal file
View File

@ -0,0 +1,308 @@
Getting semantic search set up in OpenSearch is a multistep process.
### step 1: configure the ml plugin
```json
PUT _cluster/settings
{
"persistent": {
"plugins.ml_commons.only_run_on_ml_node": "false",
"plugins.ml_commons.model_access_control_enabled": "true",
"plugins.ml_commons.native_memory_threshold": "99"
}
}
```
### step 2: create a model group
```json
POST /_plugins/_ml/model_groups/_register
{
"name": "local_model_group",
"description": "A model group for local models"
}
```
### step 3: register a pretrained model to the model group
Importantly, we need to use a model that truncates input. Also note the number of features (the embedding dimension) of the model you're using, because we'll need that dimension when we define the index mappings. For example, the model below has 768 features.
```json
POST /_plugins/_ml/models/_register
{
"name": "huggingface/sentence-transformers/all-mpnet-base-v2",
"version": "1.0.1",
"model_group_id": <model group id>,
"model_format": "TORCH_SCRIPT"
}
```
### step 4: wait until the model registration is complete
```json
GET /_plugins/_ml/tasks/<task id from above>
```
### step 5: deploy the model
Note the model id
```json
POST /_plugins/_ml/models/<model id>/_deploy
```
### step 6: create an ingest pipeline
Most models choke on empty strings, so we remove them at an earlier stage in the pipeline. We also add the model to the pipeline which generates the embeddings.
```json
PUT /_ingest/pipeline/nlp-ingest-pipeline
{
"description": "An NLP ingest pipeline",
"processors": [
{
"remove": {
"field": "text",
"if": "ctx?.text?.trim() == ''"
}
},
{
"remove": {
"field": "title",
"if": "ctx?.title?.trim() == ''"
}
},
{
"text_embedding": {
"model_id": "6whlBY0B2sj1ObjeeD5d",
"field_map": {
"text": "text_embedding",
"title": "title_embedding"
}
}
}
]
}
```
### step 7: create a new index with the knn_vector type
We'll need to create mappings for the embeddings, which is also a convenient time to specify special analyzers for the text and title fields.
```json
PUT /item-nlp
{
"settings": {
"index.knn": true,
"default_pipeline": "nlp-ingest-pipeline"
},
"mappings": {
"properties": {
"text": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"title_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"text_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
}
}
}
}
```
### step 8: create a search pipeline for weighting term search and vector search
```json
PUT /_search/pipeline/nlp-search-pipeline
{
"description": "Pre and post processor for hybrid search",
"request_processors": [
{
"neural_query_enricher" : {
"description": "Sets the default model ID at index and field levels (which doesn't actually work)",
"default_model_id": <model id>,
}
}
],
"phase_results_processors": [
{
"normalization-processor": {
"normalization": {
"technique": "min_max"
},
"combination": {
"technique": "arithmetic_mean",
"parameters": {
"weights": [
0.7,
0.3
]
}
}
}
}
]
}
```
### step 9: set it as the default search pipeline
```json
PUT /item-nlp/_settings
{
"index.search.default_pipeline" : "nlp-search-pipeline"
}
```
### step 10: reindex your data if you have data
Warning: this can take a very long time.
```json
POST _reindex?wait_for_completion=false
{
"source": {
"index": "item"
},
"dest": {
"index": "item-nlp"
}
}
```
You can check the status of the reindexing with the following command:
```json
GET _tasks/<task id>
```
### step 11: search!
```json
GET /item-nlp/_search
{
"_source": {
"excludes": [
"text_embedding",
"title_embedding"
]
},
"size": 100,
"function_score": {
"query": {
"hybrid": {
"queries": [
{
"bool": {
"should": [
{
"neural": {
"title_embedding": {
"query_text": "etf bitcoin",
"model_id": <model id>,
"k": 100
}
}
},
{
"neural": {
"text_embedding": {
"query_text": "etf bitcoin",
"model_id": <model id>,
"k": 100
}
}
}
],
"filter": [
{
"range": {
"wvotes": {
"gte": 0
}
}
}
]
}
},
{
"bool": {
"should": [
{
"multi_match": {
"query": "etf bitcoin",
"type": "most_fields",
"fields": [
"title^1000",
"text"
],
"minimum_should_match": "100%",
"boost": 10
}
},
{
"multi_match": {
"query": "etf bitcoin",
"type": "most_fields",
"fields": [
"title^1000",
"text"
],
"minimum_should_match": "60%",
"boost": 1
}
}
],
"filter": [
{
"range": {
"wvotes": {
"gte": 0
}
}
}
]
}
}
]
}
},
"functions": [
{
"field_value_factor": {
"field": "wvotes",
"modifier": "none",
"factor": 1.2
}
},
{
"field_value_factor": {
"field": "ncomments",
"modifier": "ln1p",
"factor": 1
}
}
]
}
}
```

View File

@ -63,7 +63,7 @@ async function _indexItem (item, { models }) {
try { try {
await search.index({ await search.index({
id: item.id, id: item.id,
index: 'item', index: process.env.OPENSEARCH_INDEX,
version: new Date(item.updatedAt).getTime(), version: new Date(item.updatedAt).getTime(),
versionType: 'external_gte', versionType: 'external_gte',
body: itemcp body: itemcp