#!/bin/bash

# ----------------------------------------------
# usage() function - prints help/usage message
# ----------------------------------------------
usage() {
  cat <<EOF
Usage: nlp-setup [OPTIONS] [KW_INDEX] [NLP_INDEX]

This script sets up a model and index in OpenSearch to enable semantic
search. It assumes the existence of a default, keyword-based index that has
already been fully indexed. It then creates a new index for semantic search
and re-indexes it using the items from the old index.

It then sets:

OPENSEARCH_INDEX=<NLP_INDEX>
OPENSEARCH_MODEL_ID=<MODEL_ID>

in .env.local.

After running this script, you will need to remove and rebuild your containers
using "sndev stop" and "sndev start" in order to enable semantic search.

Options:
  -h, --help     Display this help message and exit

Arguments:
  KW_INDEX       The name of the keyword index (default: item)
  NLP_INDEX      The name of the semantic index (default: item-nlp)

EOF
}

# -------------------------------------------------
# Check if user requested help via -h or --help
# -------------------------------------------------
for arg in "$@"; do
  case "$arg" in
    -h|--help)
      usage
      exit 0
      ;;
  esac
done

# ---------------------------------------
# Set defaults if not provided
# ---------------------------------------
KW_INDEX="${1:-item}"
NLP_INDEX="${2:-item-nlp}"

# ---------------------------------------
# Main script
# ---------------------------------------
OS_URL="http://localhost:9200"
MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2"

set -e

# Ensure that search is in COMPOSE_PROFILES
COMPOSE_PROFILES=$(docker exec app printenv COMPOSE_PROFILES)
if [[ ! "$COMPOSE_PROFILES" == *"search"* ]]; then
  cat <<EOF
Please ensure that COMPOSE_PROFILES contains search, then restart the
containers and try again.
EOF
  exit 1
fi

# Ensure that KW_INDEX is reachable
echo -ne "Checking that index $KW_INDEX is reachable... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$KW_INDEX")
if [ "$response" -eq 200 ]; then
  echo "yes."
else
  echo "no."
  cat <<EOF
An index named $KW_INDEX must exist in your stackernews_os volume.

If you just started up the container, wait a while and try again.

Otherwise, you may need to delete and rebuild the opensearch container and
stackernews_os volume. Check the value of OPENSEARCH_INDEX in your env
variables.
EOF
  exit 1
fi

# Check if KW_INDEX is still indexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
sleep 2
kw_count_2=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
if [ "$kw_count_2" != "$kw_count" ] || [ "$kw_count_2" -lt 5000 ]; then
  echo "It appears that $KW_INDEX is not done indexing."
  echo "Please wait until it finishes indexing, then try again."
  exit 0
fi

# Configure the ML plugin
echo -ne "Configuring the ML plugin... "
curl \
  -s -o /dev/null \
  -X PUT "$OS_URL/_cluster/settings" \
  -H "Content-Type: application/json" \
  -d '{
    "persistent": {
     "plugins.ml_commons.only_run_on_ml_node": "false",
     "plugins.ml_commons.model_access_control_enabled": "true",
     "plugins.ml_commons.native_memory_threshold": "99"
    }}'
echo "done."

# Check if a local model group is registered and register if not
echo -ne "Checking if local model group is registered... "
response=$(curl -s \
  -X POST "$OS_URL/_plugins/_ml/model_groups/_search" \
  -H "Content-Type: application/json" \
  -d '{
    "query": {
      "term": {
        "name.keyword": "local_model_group"
      }
    }}')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
  echo "yes."
  model_group_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
else
  echo "no."
  echo "Creating local model group."
  model_group_id=$(curl -s \
    -X POST "$OS_URL/_plugins/_ml/model_groups/_register" \
    -H "Content-Type: application/json" \
    -d '{
         "name": "local_model_group",
         "description": "A model group for local models"
       }' | jq -r '.model_group_id')
fi
echo "model_group_id=$model_group_id"

# Check if the model is registered and register if not
echo -ne "Checking if the NLP model is registered... "
response=$(curl \
  -s -X POST "$OS_URL/_plugins/_ml/models/_search" \
  -H "Content-Type: application/json" \
  -d '{
    "query": {
      "bool": {
        "must": [
          {"term": {"name.keyword": "'"$MODEL_NAME"'"}},
          {"term": {"model_group_id": "'"$model_group_id"'"}}
        ]
      }
    }}')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
  model_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
  echo "yes."
else
  echo "no."
  # Create the model
  task_id=$(curl \
    -s -X POST "$OS_URL/_plugins/_ml/models/_register" \
    -H "Content-Type: application/json" \
    -d '{
      "name": "'"$MODEL_NAME"'",
      "version": "1.0.1",
      "model_group_id": "'"$model_group_id"'",
      "model_format": "TORCH_SCRIPT"
    }' | jq -r '.task_id')
  echo "Downloading and registering the model (task_id=$task_id)"
  state="INCOMPLETE"
  start_time=$(date +%s)
  while [ "$state" != "COMPLETED" ]
  do
    response=$(curl \
      -s -X GET "$OS_URL/_plugins/_ml/tasks/$task_id" \
      -H "Content-Type: application/json")
    state=$(echo "$response" | jq -r '.state')
    elapsed=$(( $(date +%s) - start_time ))
    echo -ne "\rPlease wait (~1 min)... ${elapsed}s "
    sleep 1
  done
  echo "done."
  model_id=$(echo "$response" | jq -r '.model_id')
fi
echo "model_id=$model_id"

# Check if the model is deployed and deploy if not
echo -ne "Checking if the model is deployed... "
response=$(curl \
  -s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
  -H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.model_state')
if [ "$state" == "DEPLOYED" ]; then
  echo "yes."
else
  echo "no."
  task_id=$(curl -s \
    -X POST "$OS_URL/_plugins/_ml/models/$model_id/_deploy" \
    -H "Content-Type: application/json" | jq -r '.task_id')
  echo "Deploying the model (task_id=$task_id)"
  start_time=$(date +%s)
  while [ "$state" != "DEPLOYED" ]
  do
    response=$(curl \
      -s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
      -H "Content-Type: application/json")
    state=$(echo "$response" | jq -r '.model_state')
    if [ "$state" == "DEPLOY_FAILED" ]; then
      echo "Deploy failed."
      echo "Try again later."
      exit 1
    fi
    elapsed=$(( $(date +%s) - start_time ))
    echo -ne "\rPlease wait... ${elapsed}s "
    sleep 1
  done
  echo "done."
fi
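
# Optional sanity check (illustrative only, not run by this script): once the
# model is deployed, the ML Commons predict API should return a
# 768-dimensional sentence embedding. The response shape may vary between
# OpenSearch versions.
#   curl -s -X POST "$OS_URL/_plugins/_ml/_predict/text_embedding/$model_id" \
#     -H "Content-Type: application/json" \
#     -d '{
#       "text_docs": ["bitcoin is digital gold"],
#       "return_number": true,
#       "target_response": ["sentence_embedding"]
#     }' | jq '.inference_results[0].output[0].shape'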

# Create the ingest pipeline
echo -ne "Creating the ingest pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/_ingest/pipeline/nlp-ingest-pipeline" \
  -H "Content-Type: application/json" \
  -d "{
    \"description\": \"An NLP ingest pipeline\",
    \"processors\": [
      {
        \"remove\": {
          \"field\": \"text\",
          \"if\": \"ctx?.text?.trim() == ''\"
        }
      },
      {
        \"remove\": {
          \"field\": \"title\",
          \"if\": \"ctx?.title?.trim() == ''\"
        }
      },
      {
        \"text_embedding\": {
          \"model_id\": \"$model_id\",
          \"field_map\": {
            \"text\": \"text_embedding\",
            \"title\": \"title_embedding\"
          }
        }}]}"
echo "done."

# Create the hybrid search pipeline
echo -ne "Creating the hybrid search pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/_search/pipeline/nlp-search-pipeline" \
  -H "Content-Type: application/json" \
  -d '{
    "description": "Pre and post processor for hybrid search",
    "request_processors": [{
      "neural_query_enricher": {
        "description": "Sets the default model ID at index and field levels (which does not actually work)",
        "default_model_id": "'"$model_id"'"
      }
    }],
    "phase_results_processors": [{
      "normalization-processor": {
        "normalization": {"technique": "min_max"},
        "combination": {
          "technique": "arithmetic_mean",
          "parameters": {"weights": [0.7,0.3]}
        }
      }
    }]}'
echo "done."

# Check if NLP_INDEX exists and create if not
echo -ne "Checking if index $NLP_INDEX exists... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$NLP_INDEX")
if [ "$response" -eq 200 ]; then
  echo "yes."
else
  echo "no."
  echo -ne "Creating NLP index... "
  curl -s -o /dev/null \
    -X PUT "$OS_URL/$NLP_INDEX" \
    -H "Content-Type: application/json" \
    -d '{
     "settings": {
       "index.knn": true,
       "default_pipeline": "nlp-ingest-pipeline"
     },
     "mappings": {
       "properties": {
         "text": {
           "type": "text",
           "analyzer": "english",
           "fields": {
             "keyword": {"type": "keyword", "ignore_above": 256},
             "exact": {
               "type": "text",
               "analyzer": "standard"
             }
           }
         },
         "title": {
           "type": "text",
           "analyzer": "english",
           "fields": {
             "keyword": {"type": "keyword", "ignore_above": 256},
             "exact": {
               "type": "text",
               "analyzer": "standard"
             }
           }
         },
         "title_embedding": {
           "type": "knn_vector",
           "dimension": 768,
           "method": {
             "engine": "lucene",
             "space_type": "l2",
             "name": "hnsw",
             "parameters": {}
           }
         },
         "text_embedding": {
           "type": "knn_vector",
           "dimension": 768,
           "method": {
             "engine": "lucene",
             "space_type": "l2",
             "name": "hnsw",
             "parameters": {}
           }
         }
       }
     }}'
  echo "done."
fi
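
# The ".exact" subfields above are analyzed with the standard analyzer (no
# English stemming), which is what backs exact/quoted search. An illustrative
# quoted-phrase lookup against this index (not run by this script):
#   curl -s -X GET "$OS_URL/$NLP_INDEX/_search" \
#     -H "Content-Type: application/json" \
#     -d '{
#       "query": {
#         "bool": {
#           "should": [
#             {"match_phrase": {"title.exact": "exact phrase here"}},
#             {"match_phrase": {"text.exact": "exact phrase here"}}
#           ]
#         }
#       }
#     }' | jq '.hits.total.value'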

# Set hybrid search as default search pipeline
echo -ne "Setting hybrid search as default search pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/$NLP_INDEX/_settings" \
  -H "Content-Type: application/json" \
  -d '{
     "index.search.default_pipeline": "nlp-search-pipeline"
  }'
echo "done."

# Ask user if they want to begin reindexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')

echo "Ready to begin re-indexing. It may take a while (~10 min)."
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"
read -p "Do you want to begin re-indexing? (y/n) " response
if [[ "$response" =~ ^[Nn]$ ]]; then
  echo "If you need to re-index $NLP_INDEX, run this script again."
  exit 0
fi

# Re-index the data
task_id=$(curl -s \
  -X POST "$OS_URL/_reindex?wait_for_completion=false" \
  -H "Content-Type: application/json" \
  -d '{
       "source": {
         "index": "'"$KW_INDEX"'"
       },
       "dest": {
         "index": "'"$NLP_INDEX"'"
       }
     }' | jq -r '.task')
echo "Re-indexing the data (task_id=$task_id)"
completed="false"
start_time=$(date +%s)
while [ "$completed" != "true" ]
do
  completed=$(curl -s -X GET "$OS_URL/_tasks/$task_id" \
    -H "Content-Type: application/json" | jq -r '.completed')
  elapsed=$(( $(date +%s) - start_time ))
  echo -ne "\rThis may take a while (~10 min)... ${elapsed}s "
  sleep 1
done
echo "done."

kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"

# Update .env.local
echo "Updating .env.local with:"
echo "OPENSEARCH_INDEX=$NLP_INDEX"
echo "OPENSEARCH_MODEL_ID=$model_id"
if grep -q "^OPENSEARCH_INDEX=" ".env.local"; then
  sed -i '' "s|^OPENSEARCH_INDEX=.*|OPENSEARCH_INDEX=$NLP_INDEX|" ".env.local"
else
  echo "OPENSEARCH_INDEX=$NLP_INDEX" >> ".env.local"
fi

if grep -q "^OPENSEARCH_MODEL_ID=" ".env.local"; then
  sed -i '' "s|^OPENSEARCH_MODEL_ID=.*|OPENSEARCH_MODEL_ID=$model_id|" ".env.local"
else
  echo "OPENSEARCH_MODEL_ID=$model_id" >> ".env.local"
fi
echo "done."

cat <<EOF
To enable semantic search, you should restart your containers:

> ./sndev restart

Do NOT rebuild the stackernews_os volume or you will have to run this
process again.
EOF