From 9af338835329585b9244a54098cdd666b2964eec Mon Sep 17 00:00:00 2001
From: keyan <keyan.kousha+huumn@gmail.com>
Date: Mon, 15 Jan 2024 17:22:32 -0600
Subject: [PATCH] semantic search

---
 .env.sample             |   2 +
 api/resolvers/search.js | 258 +++++++++++++++++++--------------
 api/search/index.js     |  16 +--
 components/related.js   |   2 +-
 docs/semantic-search.md | 308 ++++++++++++++++++++++++++++++++++++++++
 worker/search.js        |   2 +-
 6 files changed, 471 insertions(+), 117 deletions(-)
 create mode 100644 docs/semantic-search.md

diff --git a/.env.sample b/.env.sample
index ef53d7fe..79f31442 100644
--- a/.env.sample
+++ b/.env.sample
@@ -39,6 +39,8 @@ IMGPROXY_SALT=
 OPENSEARCH_URL=http://opensearch:9200
 OPENSEARCH_USERNAME=
 OPENSEARCH_PASSWORD=
+OPENSEARCH_INDEX=item
+OPENSEARCH_MODEL_ID=
 
 #######################################################
 # WALLET / OPTIONAL                                   #
diff --git a/api/resolvers/search.js b/api/resolvers/search.js
index 10bb166f..ff785e5b 100644
--- a/api/resolvers/search.js
+++ b/api/resolvers/search.js
@@ -2,23 +2,9 @@ import { decodeCursor, LIMIT, nextCursorEncoded } from '../../lib/cursor'
 import { whenToFrom } from '../../lib/time'
 import { getItem } from './item'
 
-const STOP_WORDS = ['a', 'an', 'and', 'are', 'as', 'at', 'be', 'but',
-  'by', 'for', 'if', 'in', 'into', 'is', 'it', 'no', 'not',
-  'of', 'on', 'or', 'such', 'that', 'the', 'their', 'then',
-  'there', 'these', 'they', 'this', 'to', 'was', 'will',
-  'with', 'bitcoin', 'page', 'adds', 'how', 'why', 'what',
-  'works', 'now', 'available', 'breaking', 'app', 'powered',
-  'just', 'dev', 'using', 'crypto', 'has', 'my', 'i', 'apps',
-  'really', 'new', 'era', 'application', 'best', 'year',
-  'latest', 'still', 'few', 'crypto', 'keep', 'public', 'current',
-  'levels', 'from', 'cryptocurrencies', 'confirmed', 'news', 'network',
-  'about', 'sources', 'vote', 'considerations', 'hope',
-  'keep', 'keeps', 'including', 'we', 'brings', "don't", 'do',
-  'interesting', 'us', 'welcome', 'thoughts', 'results']
-
 export default {
   Query: {
-    related: async (parent, { title, id, cursor, limit, minMatch }, { me, models, search }) => {
+    related: async (parent, { title, id, cursor, limit = LIMIT, minMatch }, { me, models, search }) => {
       const decodedCursor = decodeCursor(cursor)
 
       if (!id && (!title || title.trim().split(/\s+/).length < 1)) {
@@ -31,7 +17,7 @@ export default {
       const like = []
       if (id) {
         like.push({
-          _index: 'item',
+          _index: process.env.OPENSEARCH_INDEX,
           _id: id
         })
       }
@@ -40,86 +26,98 @@ export default {
         like.push(title)
       }
 
-      const mustNot = []
+      const mustNot = [{ exists: { field: 'parentId' } }]
       if (id) {
         mustNot.push({ term: { id } })
       }
 
+      let should = [
+        {
+          more_like_this: {
+            fields: ['title', 'text'],
+            like,
+            min_term_freq: 1,
+            min_doc_freq: 1,
+            max_doc_freq: 5,
+            min_word_length: 2,
+            max_query_terms: 25,
+            minimum_should_match: minMatch || '10%'
+          }
+        }
+      ]
+
+      if (process.env.OPENSEARCH_MODEL_ID) {
+        let qtitle = title
+        let qtext = title
+        if (id) {
+          const item = await getItem(parent, { id }, { me, models })
+          qtitle = item.title || item.text
+          qtext = item.text || item.title
+        }
+
+        should = [
+          {
+            neural: {
+              title_embedding: {
+                query_text: qtext,
+                model_id: process.env.OPENSEARCH_MODEL_ID,
+                k: decodedCursor.offset + LIMIT
+              }
+            }
+          },
+          {
+            neural: {
+              text_embedding: {
+                query_text: qtitle,
+                model_id: process.env.OPENSEARCH_MODEL_ID,
+                k: decodedCursor.offset + LIMIT
+              }
+            }
+          }
+        ]
+      }
+
       let items = await search.search({
-        index: 'item',
-        size: limit || LIMIT,
+        index: process.env.OPENSEARCH_INDEX,
+        size: limit,
         from: decodedCursor.offset,
+        _source: {
+          excludes: [
+            'text',
+            'text_embedding',
+            'title_embedding'
+          ]
+        },
         body: {
           query: {
             function_score: {
               query: {
                 bool: {
-                  should: [
+                  should,
+                  filter: [
                     {
-                      more_like_this: {
-                        fields: ['title'],
-                        like,
-                        min_term_freq: 1,
-                        min_doc_freq: 1,
-                        min_word_length: 2,
-                        max_query_terms: 12,
-                        minimum_should_match: minMatch || '80%',
-                        stop_words: STOP_WORDS,
-                        boost: 10000
+                      bool: {
+                        should: [
+                          { match: { status: 'ACTIVE' } },
+                          { match: { status: 'NOSATS' } }
+                        ],
+                        must_not: mustNot
                       }
                     },
                     {
-                      more_like_this: {
-                        fields: ['title'],
-                        like,
-                        min_term_freq: 1,
-                        min_doc_freq: 1,
-                        min_word_length: 2,
-                        max_query_terms: 12,
-                        minimum_should_match: minMatch || '60%',
-                        stop_words: STOP_WORDS,
-                        boost: 1000
-                      }
-                    },
-                    {
-                      more_like_this: {
-                        fields: ['title'],
-                        like,
-                        min_term_freq: 1,
-                        min_doc_freq: 1,
-                        min_word_length: 2,
-                        max_query_terms: 12,
-                        minimum_should_match: minMatch || '30%',
-                        stop_words: STOP_WORDS,
-                        boost: 100
-                      }
-                    },
-                    {
-                      more_like_this: {
-                        fields: ['text'],
-                        like,
-                        min_term_freq: 1,
-                        min_doc_freq: 1,
-                        min_word_length: 2,
-                        max_query_terms: 25,
-                        minimum_should_match: minMatch || '30%',
-                        stop_words: STOP_WORDS,
-                        boost: 10
-                      }
+                      range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
                     }
-                  ],
-                  must_not: [{ exists: { field: 'parentId' } }, ...mustNot],
-                  filter: {
-                    range: { wvotes: { gte: minMatch ? 0 : 0.2 } }
-                  }
+                  ]
                 }
               },
-              field_value_factor: {
-                field: 'wvotes',
-                modifier: 'log1p',
-                factor: 1.2,
-                missing: 0
-              },
+              functions: [{
+                field_value_factor: {
+                  field: 'wvotes',
+                  modifier: 'none',
+                  factor: 1,
+                  missing: 0
+                }
+              }],
               boost_mode: 'multiply'
             }
           }
@@ -177,24 +175,14 @@ export default {
         whatArr.push({ match: { 'sub.name': sub } })
       }
 
-      const should = [
+      let termQueries = [
         {
           // all terms are matched in fields
           multi_match: {
             query,
-            type: 'most_fields',
-            fields: ['title^1000', 'text'],
+            type: 'best_fields',
+            fields: ['title^100', 'text'],
             minimum_should_match: '100%',
-            boost: 10000
-          }
-        },
-        {
-          // all terms are matched in fields fuzzily
-          multi_match: {
-            query,
-            type: 'most_fields',
-            fields: ['title^1000', 'text'],
-            minimum_should_match: '60%',
             boost: 1000
           }
         }
@@ -232,35 +220,87 @@ export default {
         }
       ]
 
-      // allow fuzzy matching for single terms
-      if (sort !== 'recent') {
-        should.push({
-          // only some terms must match unless we're sorting
+      if (sort === 'recent') {
+        // prioritize exact matches
+        termQueries.push({
+          multi_match: {
+            query,
+            type: 'phrase',
+            fields: ['title^100', 'text'],
+            boost: 1000
+          }
+        })
+      } else {
+        // allow fuzzy matching with partial matches
+        termQueries.push({
           multi_match: {
             query,
             type: 'most_fields',
-            fields: ['title^1000', 'text'],
+            fields: ['title^100', 'text'],
             fuzziness: 'AUTO',
             prefix_length: 3,
             minimum_should_match: '60%'
           }
         })
-        // small bias toward posts with comments
         functions.push({
+          // small bias toward posts with comments
           field_value_factor: {
             field: 'ncomments',
             modifier: 'ln1p',
             factor: 1
           }
+        },
+        {
+          // small bias toward recent posts
+          field_value_factor: {
+            field: 'createdAt',
+            modifier: 'log1p',
+            factor: 1
+          }
         })
       }
 
       if (query.length) {
-        whatArr.push({
-          bool: {
-            should
+        // if we have a model id and we aren't sort by recent, use neural search
+        if (process.env.OPENSEARCH_MODEL_ID && sort !== 'recent') {
+          termQueries = {
+            hybrid: {
+              queries: [
+                {
+                  bool: {
+                    should: [
+                      {
+                        neural: {
+                          title_embedding: {
+                            query_text: query,
+                            model_id: process.env.OPENSEARCH_MODEL_ID,
+                            k: decodedCursor.offset + LIMIT
+                          }
+                        }
+                      },
+                      {
+                        neural: {
+                          text_embedding: {
+                            query_text: query,
+                            model_id: process.env.OPENSEARCH_MODEL_ID,
+                            k: decodedCursor.offset + LIMIT
+                          }
+                        }
+                      }
+                    ]
+                  }
+                },
+                {
+                  bool: {
+                    should: termQueries
+                  }
+                }
+              ]
+            }
           }
-        })
+        }
+      } else {
+        termQueries = []
       }
 
       const whenRange = when === 'custom'
@@ -275,15 +315,23 @@ export default {
 
       try {
         sitems = await search.search({
-          index: 'item',
+          index: process.env.OPENSEARCH_INDEX,
           size: LIMIT,
+          _source: {
+            excludes: [
+              'text',
+              'text_embedding',
+              'title_embedding'
+            ]
+          },
           from: decodedCursor.offset,
           body: {
             query: {
               function_score: {
                 query: {
                   bool: {
-                    must: [
+                    ...(sort === 'recent' ? { must: termQueries } : { should: termQueries }),
+                    filter: [
                       ...whatArr,
                       me
                         ? {
@@ -302,9 +350,7 @@ export default {
                                 { match: { status: 'NOSATS' } }
                               ]
                             }
-                          }
-                    ],
-                    filter: [
+                          },
                       {
                         range:
                         {
diff --git a/api/search/index.js b/api/search/index.js
index 1fcc178f..ea2999f0 100644
--- a/api/search/index.js
+++ b/api/search/index.js
@@ -1,14 +1,12 @@
 import os from '@opensearch-project/opensearch'
 
-const options = process.env.NODE_ENV === 'development'
-  ? { node: process.env.OPENSEARCH_URL || 'http://localhost:9200' }
-  : {
-      node: process.env.OPENSEARCH_URL,
-      auth: {
-        username: process.env.OPENSEARCH_USERNAME,
-        password: process.env.OPENSEARCH_PASSWORD
-      }
-    }
+const options = {
+  node: process.env.OPENSEARCH_URL,
+  auth: {
+    username: process.env.OPENSEARCH_USERNAME,
+    password: process.env.OPENSEARCH_PASSWORD
+  }
+}
 
 global.os = global.os || new os.Client(options)
 
diff --git a/components/related.js b/components/related.js
index 2ccff80e..4408f59a 100644
--- a/components/related.js
+++ b/components/related.js
@@ -9,7 +9,7 @@ export default function Related ({ title, itemId }) {
   const variables = { title, id: itemId, limit: LIMIT }
   return (
     <AccordianItem
-      header={<div className='fw-bold'>related</div>}
+      header={<div className='fw-bold'>related posts</div>}
       body={
         <Items
           query={RELATED_ITEMS}
diff --git a/docs/semantic-search.md b/docs/semantic-search.md
new file mode 100644
index 00000000..70b98d23
--- /dev/null
+++ b/docs/semantic-search.md
@@ -0,0 +1,308 @@
+Getting semantic search set up in OpenSearch is a multistep process.
+
+### step 1: configure the ml plugin
+```json
+PUT _cluster/settings
+{
+  "persistent": {
+        "plugins.ml_commons.only_run_on_ml_node": "false",
+        "plugins.ml_commons.model_access_control_enabled": "true",
+        "plugins.ml_commons.native_memory_threshold": "99"
+      }
+}
+```
+
+### step 2: create a model group
+```json
+POST /_plugins/_ml/model_groups/_register
+{
+  "name": "local_model_group",
+  "description": "A model group for local models"
+}
+```
+
+### step 3: register a pretrained model to the model group
+Importantly, we need to use a model that truncates input. Note the embedding dimension (number of features) of the model you're using, because the `knn_vector` mappings created in step 7 must use the same dimension. For example, the model below has 768 features.
+
+```json
+POST /_plugins/_ml/models/_register
+{
+  "name": "huggingface/sentence-transformers/all-mpnet-base-v2",
+  "version": "1.0.1",
+  "model_group_id": <model group id>,
+  "model_format": "TORCH_SCRIPT"
+}
+```
+
+### step 4: wait until the model registration is complete
+```json
+GET /_plugins/_ml/tasks/<task id from above>
+```
+
+### step 5: deploy the model
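+Note the model id: it is needed for the ingest pipeline, the search pipeline, and the `OPENSEARCH_MODEL_ID` environment variable.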
+```json
+POST /_plugins/_ml/models/<model id>/_deploy
+```
+
+### step 6: create an ingest pipeline
+Most models choke on empty strings, so we remove them at an earlier stage in the pipeline. We also add the model to the pipeline which generates the embeddings.
+
+```json
+PUT /_ingest/pipeline/nlp-ingest-pipeline
+{
+  "description": "An NLP ingest pipeline",
+  "processors": [
+    {
+      "remove": {
+        "field": "text",
+        "if": "ctx?.text?.trim() == ''"
+      }
+    },
+    {
+      "remove": {
+        "field": "title",
+        "if": "ctx?.title?.trim() == ''"
+      }
+    },
+    {
+      "text_embedding": {
+        "model_id": <model id>,
+        "field_map": {
+          "text": "text_embedding",
+          "title": "title_embedding"
+        }
+      }
+    }
+  ]
+}
+```
+
+### step 7: create a new index with the knn_vector type
+We'll need to create mappings for the embeddings, which is also a convenient time to specify special analyzers for the text and title fields.
+
+```json
+PUT /item-nlp
+{
+  "settings": {
+    "index.knn": true,
+    "default_pipeline": "nlp-ingest-pipeline"
+  },
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text",
+        "analyzer": "english",
+        "fields": {
+          "keyword": {
+            "type": "keyword",
+            "ignore_above": 256
+          }
+        }
+      },
+      "title": {
+        "type": "text",
+        "analyzer": "english",
+        "fields": {
+          "keyword": {
+            "type": "keyword",
+            "ignore_above": 256
+          }
+        }
+      },
+      "title_embedding": {
+        "type": "knn_vector",
+        "dimension": 768,
+        "method": {
+          "engine": "lucene",
+          "space_type": "l2",
+          "name": "hnsw",
+          "parameters": {}
+        }
+      },
+      "text_embedding": {
+        "type": "knn_vector",
+        "dimension": 768,
+        "method": {
+          "engine": "lucene",
+          "space_type": "l2",
+          "name": "hnsw",
+          "parameters": {}
+        }
+      }
+    }
+  }
+}
+```
+
+### step 8: create a search pipeline for weighting term search and vector search
+```json
+PUT /_search/pipeline/nlp-search-pipeline
+{
+  "description": "Pre and post processor for hybrid search",
+  "request_processors": [
+    {
+      "neural_query_enricher" : {
+        "description": "Sets the default model ID at index and field levels (which doesn't actually work)",
+        "default_model_id": <model id>
+      }
+    }
+  ],
+  "phase_results_processors": [
+    {
+      "normalization-processor": {
+        "normalization": {
+          "technique": "min_max"
+        },
+        "combination": {
+          "technique": "arithmetic_mean",
+          "parameters": {
+            "weights": [
+              0.7,
+              0.3
+            ]
+          }
+        }
+      }
+    }
+  ]
+}
+```
+
+### step 9: set it as the default search pipeline
+```json
+PUT /item-nlp/_settings
+{
+  "index.search.default_pipeline" : "nlp-search-pipeline"
+}
+```
+
+### step 10: reindex your existing data, if any
+Warning: this takes a very long time.
+```json
+POST _reindex?wait_for_completion=false
+{
+  "source": {
+    "index": "item"
+  },
+  "dest": {
+    "index": "item-nlp"
+  }
+}
+```
+
+You can check the status of the reindexing with the following command:
+```json
+GET _tasks/<task id>
+```
+
+### step 11: search!
+```json
+GET /item-nlp/_search
+{
+  "_source": {
+    "excludes": [
+      "text_embedding",
+      "title_embedding"
+    ]
+  },
+  "size": 100,
+  "function_score": {
+    "query": {
+      "hybrid": {
+        "queries": [
+          {
+            "bool": {
+              "should": [
+                {
+                  "neural": {
+                    "title_embedding": {
+                      "query_text": "etf bitcoin",
+                      "model_id": <model id>,
+                      "k": 100
+                    }
+                  }
+                },
+                {
+                  "neural": {
+                    "text_embedding": {
+                      "query_text": "etf bitcoin",
+                      "model_id": <model id>,
+                      "k": 100
+                    }
+                  }
+                }
+              ],
+              "filter": [
+                {
+                  "range": {
+                    "wvotes": {
+                      "gte": 0
+                    }
+                  }
+                }
+              ]
+            }
+          },
+          {
+            "bool": {
+              "should": [
+                {
+                  "multi_match": {
+                    "query": "etf bitcoin",
+                    "type": "most_fields",
+                    "fields": [
+                      "title^1000",
+                      "text"
+                    ],
+                    "minimum_should_match": "100%",
+                    "boost": 10
+                  }
+                },
+                {
+                  "multi_match": {
+                    "query": "etf bitcoin",
+                    "type": "most_fields",
+                    "fields": [
+                      "title^1000",
+                      "text"
+                    ],
+                    "minimum_should_match": "60%",
+                    "boost": 1
+                  }
+                }
+              ],
+              "filter": [
+                {
+                  "range": {
+                    "wvotes": {
+                      "gte": 0
+                    }
+                  }
+                }
+              ]
+            }
+          }
+        ]
+      }
+    },
+    "functions": [
+      {
+        "field_value_factor": {
+          "field": "wvotes",
+          "modifier": "none",
+          "factor": 1.2
+        }
+      },
+      {
+        "field_value_factor": {
+          "field": "ncomments",
+          "modifier": "ln1p",
+          "factor": 1
+        }
+      }
+    ]
+  }
+}
+```
+
diff --git a/worker/search.js b/worker/search.js
index 55ed3993..fb0a4d5f 100644
--- a/worker/search.js
+++ b/worker/search.js
@@ -63,7 +63,7 @@ async function _indexItem (item, { models }) {
   try {
     await search.index({
       id: item.id,
-      index: 'item',
+      index: process.env.OPENSEARCH_INDEX,
       version: new Date(item.updatedAt).getTime(),
       versionType: 'external_gte',
       body: itemcp