From e4a2228d7c9577d25d062193413a00d783111916 Mon Sep 17 00:00:00 2001 From: Edward Kung Date: Mon, 7 Apr 2025 15:08:37 -0700 Subject: [PATCH] NLP startup script + opensearch fixes (#2070) * fix opensearch startup * nlp setup script * nlp setup documentation * move script to ./scripts and update docs --------- Co-authored-by: k00b --- README.md | 18 ++ docker-compose.yml | 21 +- docker/opensearch/init-opensearch.sh | 43 +++ docs/dev/semantic-search.md | 24 +- scripts/nlp-setup | 397 +++++++++++++++++++++++++++ 5 files changed, 484 insertions(+), 19 deletions(-) create mode 100755 docker/opensearch/init-opensearch.sh create mode 100755 scripts/nlp-setup diff --git a/README.md b/README.md index 86e69485..f45b481c 100644 --- a/README.md +++ b/README.md @@ -131,6 +131,24 @@ services: You can read more about [docker compose override files](https://docs.docker.com/compose/multiple-compose-files/merge/). +#### Enabling semantic search + +To enable semantic search that uses text embeddings, run `./scripts/nlp-setup`. + +Before running `./scripts/nlp-setup`, ensure the following are true: + +- search is enabled in `COMPOSE_PROFILES`: + + ```.env + COMPOSE_PROFILES=...,search,... + ``` +- The default OpenSearch index (default name=`item`) has been created and has finished indexing. This should happen the first time you run `./sndev start`, but it may take a few minutes for indexing to complete. + +After `nlp-setup` is done, restart your containers to enable semantic search: + +``` +> ./sndev restart +```
diff --git a/docker-compose.yml b/docker-compose.yml index ad630e6a..60d870ce 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -176,31 +176,16 @@ services: - OPENSEARCH_INITIAL_ADMIN_PASSWORD=${OPENSEARCH_PASSWORD} - plugins.security.disabled=true - discovery.type=single-node + - "_JAVA_OPTIONS=-Xms2g -Xmx2g -XX:UseSVE=0" ports: - 9200:9200 # REST API - 9600:9600 # Performance Analyzer volumes: - os:/usr/share/opensearch/data + - ./docker/opensearch/init-opensearch.sh:/usr/share/opensearch/init-opensearch.sh labels: CONNECT: "localhost:9200" - command: > - bash -c ' - set -m - /usr/share/opensearch/opensearch-docker-entrypoint.sh & - until curl -sS "http://localhost:9200/_cat/health?h=status" -ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} | grep -q "green\|yellow"; do - echo "Waiting for OpenSearch to start..." - sleep 1 - done - echo "OpenSearch started." - curl \ - -H "Content-Type: application/json" \ - -X PUT \ - -d '{"mappings":{"properties":{"text":{"type":"text","analyzer":"english","fields":{"keyword":{"type":"keyword","ignore_above":256}}},"title":{"type":"text","analyzer":"english","fields":{"keyword":{"type":"keyword","ignore_above":256}}}}}}' \ - "http://localhost:9200/item" \ - -ku admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD} - echo "OpenSearch index created." - fg - ' + command: ["bash", "/usr/share/opensearch/init-opensearch.sh"] cpu_shares: "${CPU_SHARES_LOW}" os-dashboard: image: opensearchproject/opensearch-dashboards:2.17.0 diff --git a/docker/opensearch/init-opensearch.sh b/docker/opensearch/init-opensearch.sh new file mode 100755 index 00000000..16267260 --- /dev/null +++ b/docker/opensearch/init-opensearch.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +set -m # enable job control so the final fg can foreground OpenSearch + +/usr/share/opensearch/opensearch-docker-entrypoint.sh & + +# ---- Wait for OpenSearch to start + +until curl -sS "http://localhost:9200/_cat/health?h=status" -ku "admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD}" | grep -q "green\|yellow"; do + echo "Waiting for OpenSearch to start..." 
+ sleep 1 +done + +# ---- If index doesn't exist, create it with default settings (index name falls back to "item" when OPENSEARCH_INDEX is unset) + +index_exists=$(curl -s -o /dev/null -w "%{http_code}" -I "http://localhost:9200/${OPENSEARCH_INDEX:-item}") + +if [ "$index_exists" -eq 200 ]; then + echo "OpenSearch index ${OPENSEARCH_INDEX:-item} already exists." +else + curl \ + -H "Content-Type: application/json" \ + -X PUT \ + -d '{ + "mappings": { + "properties": { + "text": { + "type": "text", + "analyzer": "english", + "fields": {"keyword":{"type":"keyword","ignore_above":256}} + }, + "title": { + "type": "text", + "analyzer": "english", + "fields": {"keyword":{"type":"keyword","ignore_above":256}} + }}}}' \ + "http://localhost:9200/${OPENSEARCH_INDEX:-item}" \ + -ku "admin:${OPENSEARCH_INITIAL_ADMIN_PASSWORD}" + echo "" + echo "OpenSearch index ${OPENSEARCH_INDEX:-item} created." +fi + +fg # bring the backgrounded OpenSearch entrypoint back to the foreground \ No newline at end of file diff --git a/docs/dev/semantic-search.md b/docs/dev/semantic-search.md index 29d2d0b1..33ed4bae 100644 --- a/docs/dev/semantic-search.md +++ b/docs/dev/semantic-search.md @@ -1,4 +1,26 @@ -Getting semantic search setup in OpenSearch is currently a multistep, manual process. To configure semantic search, enter the following commands into OpenSearch's REST API. You can do this in Dev Tools in the OpenSearch Dashboard (after starting your SN dev environment, point your browser to localhost:5601). You can also use CURL to send these commands to localhost:9200. +## Automated setup + +To enable semantic search that uses text embeddings, run `./scripts/nlp-setup`. + +Before running `./scripts/nlp-setup`, ensure the following are true: + +- search is enabled in `COMPOSE_PROFILES`: + + ```.env + COMPOSE_PROFILES=...,search,... + ``` +- The default opensearch index (default name=`item`) is created and done indexing. This should happen the first time you run `./sndev start`, but it may take a few minutes for indexing to complete. 
+ +After `nlp-setup` is done, restart your containers to enable semantic search: + +``` +> ./sndev restart +``` + + +## Manual setup + +You can also set up and configure semantic search manually. To do so, enter the following commands into OpenSearch's REST API. You can do this in Dev Tools in the OpenSearch Dashboard (after starting your SN dev environment, point your browser to localhost:5601). You can also use CURL to send these commands to localhost:9200. ### step 1: configure the ml plugin ```json diff --git a/scripts/nlp-setup b/scripts/nlp-setup new file mode 100755 index 00000000..f9d75334 --- /dev/null +++ b/scripts/nlp-setup @@ -0,0 +1,397 @@ +#!/bin/bash + +# ---------------------------------------------- +# usage() function - prints help/usage message +# ---------------------------------------------- +usage() { + cat < +OPENSEARCH_MODEL_ID= + +in .env.local. + +After running this script, you will need to remove and rebuild your containers +using "sndev stop" and "sndev start", in order to enable semantic search. + +Options: + -h, --help Display this help message and exit + +Arguments: + KW_INDEX The name of the keyword index (default: item) + NLP_INDEX The name of the semantic index (default: item-nlp) + +EOF +} + +# ------------------------------------------------- +# Check if user requested help via -h or --help +# ------------------------------------------------- +for arg in "$@"; do + case "$arg" in + -h|--help) + usage + exit 0 + ;; + esac +done + +# --------------------------------------- +# Set defaults if not provided +# --------------------------------------- +KW_INDEX="${1:-item}" +NLP_INDEX="${2:-item-nlp}" + +# --------------------------------------- +# Main script +# --------------------------------------- +OS_URL="http://localhost:9200" +MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2" + +set -e + +# Ensure that search is in COMPOSE_PROFILES +COMPOSE_PROFILES=$(docker exec app printenv COMPOSE_PROFILES) +if [[ ! 
"$COMPOSE_PROFILES" == *"search"* ]]; then + cat <> ".env.local" +fi + +if grep -q "^OPENSEARCH_MODEL_ID=" ".env.local"; then + sed -i.bak "s|^OPENSEARCH_MODEL_ID=.*|OPENSEARCH_MODEL_ID=$model_id|" ".env.local"; rm -f ".env.local.bak" # -i.bak is accepted by both GNU and BSD sed (-i '' is BSD-only) +else + echo "OPENSEARCH_MODEL_ID=$model_id" >> ".env.local" +fi +echo "done." + +cat < ./sndev restart + +Do NOT rebuild the stackernews_os volume or you will have to run this +process again. +EOF