NLP startup script + opensearch fixes (#2070)

* fix opensearch startup

* nlp setup script

* nlp setup documentation

* move script to ./scripts and update docs

---------

Co-authored-by: k00b <k00b@stacker.news>
Edward Kung 2025-04-07 15:08:37 -07:00, committed by GitHub
parent f43a522a87
commit e4a2228d7c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 484 additions and 19 deletions


@ -131,6 +131,24 @@ services:
You can read more about [docker compose override files](https://docs.docker.com/compose/multiple-compose-files/merge/).
#### Enabling semantic search
To enable semantic search that uses text embeddings, run `./scripts/nlp-setup`.
Before running `./scripts/nlp-setup`, ensure the following are true:
- search is enabled in `COMPOSE_PROFILES`:
```.env
COMPOSE_PROFILES=...,search,...
```
- The default OpenSearch index (default name=`item`) has been created and has finished indexing. This should happen the first time you run `./sndev start`, but it may take a few minutes for indexing to complete.
After `nlp-setup` is done, restart your containers to enable semantic search:
```
> ./sndev restart
```
<br>
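One way to confirm the `item` index has finished indexing is the same heuristic `nlp-setup` uses: sample the document count twice and check that it is stable. A sketch; `counts_stable` is an illustrative helper, and the endpoint and index name below are the dev defaults:

```shell
# counts_stable succeeds when two sampled document counts are non-empty
# and equal, the heuristic nlp-setup uses to decide the keyword index
# has finished indexing.
counts_stable() {
  [ -n "$1" ] && [ "$1" = "$2" ]
}

# Against a running dev stack (dev-default endpoint and index assumed):
#   c1=$(curl -s http://localhost:9200/item/_count | jq -r '.count')
#   sleep 2
#   c2=$(curl -s http://localhost:9200/item/_count | jq -r '.count')
#   counts_stable "$c1" "$c2" && echo "indexing appears complete"
```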


@ -176,31 +176,16 @@ services:
- OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_PASSWORD}
- plugins.security.disabled=true
- discovery.type=single-node
- "_JAVA_OPTIONS=-Xms2g -Xmx2g -XX:UseSVE=0"
ports:
- 9200:9200 # REST API
- 9600:9600 # Performance Analyzer
volumes:
- os:/usr/share/opensearch/data
- ./docker/opensearch/init-opensearch.sh:/usr/share/opensearch/init-opensearch.sh
labels:
CONNECT: "localhost:9200"
command: >
bash -c '
set -m
/usr/share/opensearch/opensearch-docker-entrypoint.sh &
until curl -sS "http://localhost:9200/_cat/health?h=status" -ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} | grep -q "green\|yellow"; do
echo "Waiting for OpenSearch to start..."
sleep 1
done
echo "OpenSearch started."
curl \
-H "Content-Type: application/json" \
-X PUT \
-d '{"mappings":{"properties":{"text":{"type":"text","analyzer":"english","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"title":{"type":"text","analyzer":"english","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}}}' \
"http://localhost:9200/item" \
-ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
echo "OpenSearch index created."
fg
'
command: ["bash", "/usr/share/opensearch/init-opensearch.sh"]
cpu_shares: "${CPU_SHARES_LOW}"
os-dashboard:
image: opensearchproject/opensearch-dashboards:2.17.0


@ -0,0 +1,43 @@
#!/bin/bash
set -m
/usr/share/opensearch/opensearch-docker-entrypoint.sh &
# ---- Wait for OpenSearch to start
until curl -sS "http://localhost:9200/_cat/health?h=status" -ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} | grep -q "green\|yellow"; do
echo "Waiting for OpenSearch to start..."
sleep 1
done
# ---- If index doesn't exist, create it with default settings
index_exists=$(curl -s -o /dev/null -w "%{http_code}" -I "http://localhost:9200/$OPENSEARCH_INDEX")
if [ "$index_exists" -eq 200 ]; then
echo "OpenSearch index $OPENSEARCH_INDEX already exists."
else
curl \
-H "Content-Type: application/json" \
-X PUT \
-d '{
"mappings": {
"properties": {
"text": {
"type": "text",
"analyzer": "english",
"fields": {"keyword":{"type":"keyword","ignore_above":256}}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {"keyword":{"type":"keyword","ignore_above":256}}
}}}}' \
"http://localhost:9200/$OPENSEARCH_INDEX" \
-ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD}
echo ""
echo "OpenSearch index $OPENSEARCH_INDEX created."
fi
fg
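The readiness loop above keys off `_cat/health`; its status check can be factored out and exercised on its own. A sketch (the `os_ready` function name is illustrative, not part of the script):

```shell
# os_ready reads a "_cat/health?h=status" response on stdin and succeeds
# when the cluster is green or yellow, the condition the startup loop in
# init-opensearch.sh waits for before creating the index.
os_ready() {
  grep -q "green\|yellow"
}

# usage against a live node (dev-default endpoint assumed):
#   until curl -sS "http://localhost:9200/_cat/health?h=status" | os_ready; do
#     sleep 1
#   done
```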


@ -1,4 +1,26 @@
Getting semantic search set up in OpenSearch is currently a multi-step, manual process. To configure semantic search, enter the following commands into OpenSearch's REST API. You can do this in Dev Tools in the OpenSearch Dashboard (after starting your SN dev environment, point your browser to localhost:5601). You can also use curl to send these commands to localhost:9200.
## Automated setup
To enable semantic search that uses text embeddings, run `./scripts/nlp-setup`.
Before running `./scripts/nlp-setup`, ensure the following are true:
- search is enabled in `COMPOSE_PROFILES`:
```.env
COMPOSE_PROFILES=...,search,...
```
- The default OpenSearch index (default name=`item`) has been created and has finished indexing. This should happen the first time you run `./sndev start`, but it may take a few minutes for indexing to complete.
After `nlp-setup` is done, restart your containers to enable semantic search:
```
> ./sndev restart
```
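After the restart, you can sanity-check that the model referenced by `OPENSEARCH_MODEL_ID` is actually deployed. A hedged sketch; `model_deployed` is an illustrative helper, and the `_plugins/_ml` endpoint is the one `nlp-setup` itself polls:

```shell
# model_deployed reads an OpenSearch ML model document (JSON) on stdin
# and succeeds when model_state is DEPLOYED.
model_deployed() {
  jq -e '.model_state == "DEPLOYED"' > /dev/null
}

# usage against a live dev stack:
#   curl -s "http://localhost:9200/_plugins/_ml/models/$OPENSEARCH_MODEL_ID" \
#     | model_deployed && echo "model is deployed"
```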
## Manual setup
You can also set up and configure semantic search manually. To do so, enter the following commands into OpenSearch's REST API. You can do this in Dev Tools in the OpenSearch Dashboard (after starting your SN dev environment, point your browser to localhost:5601). You can also use curl to send these commands to localhost:9200.
### step 1: configure the ml plugin
```json

scripts/nlp-setup (new executable file, 397 lines)

@ -0,0 +1,397 @@
#!/bin/bash
# ----------------------------------------------
# usage() function - prints help/usage message
# ----------------------------------------------
usage() {
cat <<EOF
Usage: nlp-setup [OPTIONS] [KW_INDEX] [NLP_INDEX]
This script sets up a model and index in OpenSearch to enable semantic
search. It assumes the existence of a default, keyword-based index that
has already been fully indexed. It then creates a new index for semantic
search and re-indexes it with the items from the old index.
It then sets:
OPENSEARCH_INDEX=<NLP_INDEX>
OPENSEARCH_MODEL_ID=<MODEL_ID>
in .env.local.
After running this script, you will need to restart your containers with
"./sndev restart" in order to enable semantic search.
Options:
-h, --help Display this help message and exit
Arguments:
KW_INDEX The name of the keyword index (default: item)
NLP_INDEX The name of the semantic index (default: item-nlp)
EOF
}
# -------------------------------------------------
# Check if user requested help via -h or --help
# -------------------------------------------------
for arg in "$@"; do
case "$arg" in
-h|--help)
usage
exit 0
;;
esac
done
# ---------------------------------------
# Set defaults if not provided
# ---------------------------------------
KW_INDEX="${1:-item}"
NLP_INDEX="${2:-item-nlp}"
# ---------------------------------------
# Main script
# ---------------------------------------
OS_URL="http://localhost:9200"
MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2"
set -e
# Ensure that search is in COMPOSE_PROFILES
COMPOSE_PROFILES=$(docker exec app printenv COMPOSE_PROFILES)
if [[ ! "$COMPOSE_PROFILES" == *"search"* ]]; then
cat <<EOF
Please ensure that COMPOSE_PROFILES contains search, then restart the
containers and try again.
EOF
exit 1
fi
# Ensure that KW_INDEX is reachable
echo -ne "Checking that index $KW_INDEX is reachable... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$KW_INDEX")
if [ "$response" -eq 200 ]; then
echo "yes."
else
echo "no."
cat <<EOF
An index named $KW_INDEX must exist in your stackernews_os volume.
If you just started up the container, wait a while and try again.
Otherwise, you may need to delete and rebuild the opensearch container and
stackernews_os volume. Check the value of OPENSEARCH_INDEX in your env
variables.
EOF
exit 1
fi
# Check if KW_INDEX is still indexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
sleep 2
kw_count_2=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
if [ "$kw_count_2" != "$kw_count" ] || [ "$kw_count_2" -lt 5000 ]; then
echo "It appears that $KW_INDEX is not done indexing."
echo "Please wait until it finishes indexing, then try again."
exit 0
fi
# Configure the ML plugin
echo -ne "Configuring the ML plugin... "
curl \
-s -o /dev/null \
-X PUT "$OS_URL/_cluster/settings" \
-H "Content-Type: application/json" \
-d '{
"persistent": {
"plugins.ml_commons.only_run_on_ml_node": "false",
"plugins.ml_commons.model_access_control_enabled": "true",
"plugins.ml_commons.native_memory_threshold": "99"
}}'
echo "done."
# Check if a local model group is registered and register if not
echo -ne "Checking if local model group is registered... "
response=$(curl -s \
-X POST "$OS_URL/_plugins/_ml/model_groups/_search" \
-H "Content-Type: application/json" \
-d '{
"query": {
"term": {
"name.keyword": "local_model_group"
}
}}')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
echo "yes."
model_group_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
else
echo "no."
echo "Creating local model group."
model_group_id=$(curl -s \
-X POST "$OS_URL/_plugins/_ml/model_groups/_register" \
-H "Content-Type: application/json" \
-d '{
"name": "local_model_group",
"description": "A model group for local models"
}' | jq -r '.model_group_id')
fi
echo "model_group_id=$model_group_id"
# Check if the model is registered and register if not
echo -ne "Checking if the NLP model is registered... "
response=$(curl \
-s -X POST "$OS_URL/_plugins/_ml/models/_search" \
-H "Content-Type: application/json" \
-d '{
"query": {
"bool": {
"must": [
{"term": {"name.keyword": "'"$MODEL_NAME"'"}},
{"term": {"model_group_id": "'"$model_group_id"'"}}
]
}
}}')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
model_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
echo "yes."
else
echo "no."
# Create the model
task_id=$(curl \
-s -X POST "$OS_URL/_plugins/_ml/models/_register" \
-H "Content-Type: application/json" \
-d '{
"name": "'"$MODEL_NAME"'",
"version": "1.0.1",
"model_group_id": "'"$model_group_id"'",
"model_format": "TORCH_SCRIPT"
}' | jq -r '.task_id')
echo "Downloading and registering the model (task_id=$task_id)"
state="INCOMPLETE"
start_time=$(date +%s)
while [ "$state" != "COMPLETED" ]
do
response=$(curl \
-s -X GET "$OS_URL/_plugins/_ml/tasks/$task_id" \
-H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.state')
if [ "$state" == "FAILED" ]; then
echo "Model registration failed. Try again later."
exit 1
fi
elapsed=$(( $(date +%s) - start_time ))
echo -ne "\rPlease wait (~1 min)... ${elapsed}s "
sleep 1
done
echo "done."
model_id=$(echo "$response" | jq -r '.model_id')
fi
echo "model_id=$model_id"
# Check if the model is deployed and deploy if not
echo -ne "Checking if the model is deployed... "
response=$(curl \
-s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
-H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.model_state')
if [ "$state" == "DEPLOYED" ]; then
echo "yes."
else
echo "no."
task_id=$(curl -s \
-X POST "$OS_URL/_plugins/_ml/models/$model_id/_deploy" \
-H "Content-Type: application/json" | jq -r '.task_id')
echo "Deploying the model (task_id=$task_id)"
start_time=$(date +%s)
while [ "$state" != "DEPLOYED" ]
do
response=$(curl \
-s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
-H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.model_state')
if [ "$state" == "DEPLOY_FAILED" ]; then
echo "Deploy failed."
echo "Try again later."
exit 1
fi
elapsed=$(( $(date +%s) - start_time ))
echo -ne "\rPlease wait... ${elapsed}s "
sleep 1
done
echo "done."
fi
# Create the ingest pipeline
echo -ne "Creating the ingest pipeline... "
curl -s -o /dev/null \
-X PUT "$OS_URL/_ingest/pipeline/nlp-ingest-pipeline" \
-H "Content-Type: application/json" \
-d "{
\"description\": \"An NLP ingest pipeline\",
\"processors\": [
{
\"remove\": {
\"field\": \"text\",
\"if\": \"ctx?.text?.trim() == ''\"
}
},
{
\"remove\": {
\"field\": \"title\",
\"if\": \"ctx?.title?.trim() == ''\"
}
},
{
\"text_embedding\": {
\"model_id\": \"$model_id\",
\"field_map\": {
\"text\": \"text_embedding\",
\"title\": \"title_embedding\"
}
}}]}"
echo "done."
# Create the hybrid search pipeline
echo -ne "Creating the hybrid search pipeline... "
curl -s -o /dev/null \
-X PUT "$OS_URL/_search/pipeline/nlp-search-pipeline" \
-H "Content-Type: application/json" \
-d '{
"description": "Pre and post processor for hybrid search",
"request_processors": [{
"neural_query_enricher": {
"description": "Sets the default model ID at index and field levels (which doesn't actually work)",
"default_model_id": "'"$model_id"'"
}
}],
"phase_results_processors": [{
"normalization-processor": {
"normalization": {"technique": "min_max"},
"combination": {
"technique": "arithmetic_mean",
"parameters": {"weights": [0.7,0.3]}
}
}
}]}'
echo "done."
# Check if NLP_INDEX exists and create if not
echo -ne "Checking if index $NLP_INDEX exists... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$NLP_INDEX")
if [ "$response" -eq 200 ]; then
echo "yes."
else
echo "no."
echo -ne "Creating NLP index... "
curl -s -o /dev/null \
-X PUT "$OS_URL/$NLP_INDEX" \
-H "Content-Type: application/json" \
-d '{
"settings": {
"index.knn": true,
"default_pipeline": "nlp-ingest-pipeline"
},
"mappings": {
"properties": {
"text": {
"type": "text",
"analyzer": "english",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {"keyword": {"type": "keyword", "ignore_above": 256}}
},
"title_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"text_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
}
}
}}'
echo "done."
fi
# Ask user if they want to begin reindexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')
echo "Ready to begin re-indexing. It may take a while (~10 min)."
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"
read -p "Do you want to begin re-indexing? (y/n) " response
if [[ "$response" =~ ^[Nn]$ ]]; then
echo "If you need to re-index $NLP_INDEX, run this script again."
exit 0
fi
# Re-index the data
task_id=$(curl -s \
-X POST "$OS_URL/_reindex?wait_for_completion=false" \
-H "Content-Type: application/json" \
-d '{
"source": {
"index": "'"$KW_INDEX"'"
},
"dest": {
"index": "'"$NLP_INDEX"'"
}
}' | jq -r '.task')
echo "Re-indexing the data (task_id=$task_id)"
completed="false"
start_time=$(date +%s)
while [ "$completed" != "true" ]
do
completed=$(curl -s -X GET "$OS_URL/_tasks/$task_id" \
-H "Content-Type: application/json" | jq -r '.completed')
elapsed=$(( $(date +%s) - start_time ))
echo -ne "\rThis may take a while (~10 min)... ${elapsed}s "
sleep 1
done
echo "done."
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"
# Update .env.local
echo "Updating .env.local with:"
echo "OPENSEARCH_INDEX=$NLP_INDEX"
echo "OPENSEARCH_MODEL_ID=$model_id"
if grep -q "^OPENSEARCH_INDEX=" ".env.local"; then
# -i.bak works with both GNU and BSD sed; -i '' is BSD-only
sed -i.bak "s|^OPENSEARCH_INDEX=.*|OPENSEARCH_INDEX=$NLP_INDEX|" ".env.local" && rm -f ".env.local.bak"
else
echo "OPENSEARCH_INDEX=$NLP_INDEX" >> ".env.local"
fi
if grep -q "^OPENSEARCH_MODEL_ID=" ".env.local"; then
sed -i.bak "s|^OPENSEARCH_MODEL_ID=.*|OPENSEARCH_MODEL_ID=$model_id|" ".env.local" && rm -f ".env.local.bak"
else
echo "OPENSEARCH_MODEL_ID=$model_id" >> ".env.local"
fi
echo "done."
cat <<EOF
To enable semantic search, you should restart your containers:
> ./sndev restart
Do NOT rebuild the stackernews_os volume or you will have to run this
process again.
EOF
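Once the containers are restarted, a quick end-to-end check is to issue a neural query through the search pipeline that `nlp-setup` created. A sketch; the request body below is a minimal illustration built around the `text_embedding` field defined in the index mappings, not the exact query Stacker News issues:

```shell
# neural_query emits a minimal neural-search request body targeting the
# text_embedding field created by nlp-setup; $1 is the query text and
# $2 is the model id written to .env.local as OPENSEARCH_MODEL_ID.
neural_query() {
  jq -n --arg q "$1" --arg model "$2" \
    '{query: {neural: {text_embedding: {query_text: $q, model_id: $model, k: 10}}}}'
}

# usage against a live dev stack (index and pipeline names are defaults):
#   neural_query "bitcoin" "$OPENSEARCH_MODEL_ID" | curl -s \
#     -H "Content-Type: application/json" \
#     -X POST "http://localhost:9200/item-nlp/_search?search_pipeline=nlp-search-pipeline" \
#     -d @-
```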