stacker.news/scripts/nlp-setup
#!/bin/bash
# ----------------------------------------------
# usage() function - prints help/usage message
# ----------------------------------------------
usage() {
  cat <<EOF
Usage: nlp-setup [OPTIONS] [KW_INDEX] [NLP_INDEX]

This script sets up a model and an index in OpenSearch to enable semantic
search. It assumes the existence of a default, keyword-based index that has
already been fully indexed. It then creates a new index for semantic search
and re-indexes it using the items from the old index.

It then sets:
  OPENSEARCH_INDEX=<NLP_INDEX>
  OPENSEARCH_MODEL_ID=<MODEL_ID>
in .env.local.

After running this script, you will need to remove and rebuild your containers
using "sndev stop" and "sndev start" in order to enable semantic search.

Options:
  -h, --help    Display this help message and exit

Arguments:
  KW_INDEX      The name of the keyword index (default: item)
  NLP_INDEX     The name of the semantic index (default: item-nlp)
EOF
}
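# Example invocation (argument values are illustrative; both default as shown
# in usage() above):
#   ./scripts/nlp-setup item item-nlp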
# -------------------------------------------------
# Check if user requested help via -h or --help
# -------------------------------------------------
for arg in "$@"; do
  case "$arg" in
    -h|--help)
      usage
      exit 0
      ;;
  esac
done
# ---------------------------------------
# Set defaults if not provided
# ---------------------------------------
KW_INDEX="${1:-item}"
NLP_INDEX="${2:-item-nlp}"
# ---------------------------------------
# Main script
# ---------------------------------------
OS_URL="http://localhost:9200"
MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2"
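# Note: all-mpnet-base-v2 produces 768-dimensional sentence embeddings, which
# is why the knn_vector mappings created further below use "dimension": 768.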
set -e
# Ensure that search is in COMPOSE_PROFILES
COMPOSE_PROFILES=$(docker exec app printenv COMPOSE_PROFILES)
if [[ ! "$COMPOSE_PROFILES" == *"search"* ]]; then
  cat <<EOF
Please ensure that COMPOSE_PROFILES contains search, then restart the
containers and try again.
EOF
  exit 1
fi
# Ensure that KW_INDEX is reachable
echo -ne "Checking that index $KW_INDEX is reachable... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$KW_INDEX")
if [ "$response" -eq 200 ]; then
echo "yes."
else
echo "no."
cat <<EOF
An index named $KW_INDEX must exist in your stackernews_os volume.
If you just started up the container, wait a while and try again.
Otherwise, you may need to delete and rebuild the opensearch container and
stackernews_os volume. Check the value of OPENSEARCH_INDEX in your env
variables.
EOF
exit 1
fi
# Check if KW_INDEX is still indexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
sleep 2
kw_count_2=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
if [ "$kw_count_2" != "$kw_count" ] || [ "$kw_count_2" -lt 5000 ]; then
echo "It appears that $KW_INDEX is not done indexing."
echo "Please wait until it finishes indexing, then try again."
exit 0
fi
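# To watch indexing progress manually, you can poll the document count
# yourself (same API this script uses):
#   curl -s "$OS_URL/$KW_INDEX/_count" | jq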
# Configure the ML plugin
echo -ne "Configuring the ML plugin... "
curl \
  -s -o /dev/null \
  -X PUT "$OS_URL/_cluster/settings" \
  -H "Content-Type: application/json" \
  -d '{
    "persistent": {
      "plugins.ml_commons.only_run_on_ml_node": "false",
      "plugins.ml_commons.model_access_control_enabled": "true",
      "plugins.ml_commons.native_memory_threshold": "99"
    }
  }'
echo "done."
# Check if a local model group is registered and register if not
echo -ne "Checking if local model group is registered... "
response=$(curl -s \
  -X POST "$OS_URL/_plugins/_ml/model_groups/_search" \
  -H "Content-Type: application/json" \
  -d '{
    "query": {
      "term": {
        "name.keyword": "local_model_group"
      }
    }
  }')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
echo "yes."
model_group_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
else
echo "no."
echo "Creating local model group."
model_group_id=$(curl -s \
-X POST "$OS_URL/_plugins/_ml/model_groups/_register" \
-H "Content-Type: application/json" \
-d '{
"name": "local_model_group",
"description": "A model group for local models"
}' | jq -r '.model_group_id')
fi
echo "model_group_id=$model_group_id"
# Check if the model is registered and register if not
echo -ne "Checking if the NLP model is registered... "
response=$(curl \
  -s -X POST "$OS_URL/_plugins/_ml/models/_search" \
  -H "Content-Type: application/json" \
  -d '{
    "query": {
      "bool": {
        "must": [
          {"term": {"name.keyword": "'"$MODEL_NAME"'"}},
          {"term": {"model_group_id": "'"$model_group_id"'"}}
        ]
      }
    }
  }')
exists=$(echo "$response" | jq -r '.hits.total.value')
if [ "$exists" -gt 0 ]; then
model_id=$(echo "$response" | jq -r '.hits.hits[0]._id')
echo "yes."
else
echo "no."
# Create the model
task_id=$(curl \
-s -X POST "$OS_URL/_plugins/_ml/models/_register" \
-H "Content-Type: application/json" \
-d '{
"name": "'"$MODEL_NAME"'",
"version": "1.0.1",
"model_group_id": "'"$model_group_id"'",
"model_format": "TORCH_SCRIPT"
}' | jq -r '.task_id')
echo "Downloading and registering the model (task_id=$task_id)"
state="INCOMPLETE"
start_time=$(date +%s)
while [ $state != "COMPLETED" ]
do
response=$(curl \
-s -X GET "$OS_URL/_plugins/_ml/tasks/$task_id" \
-H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.state')
elapsed=$(( $(date +%s) - start_time ))
echo -ne "\rPlease wait (~1 min)... ${elapsed}s "
sleep 1
done
echo "done."
model_id=$(echo "$response" | jq -r '.model_id')
fi
echo "model_id=$model_id"
# Check if the model is deployed and deploy if not
echo -ne "Checking if the model is deployed... "
response=$(curl \
  -s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
  -H "Content-Type: application/json")
state=$(echo "$response" | jq -r '.model_state')
if [ "$state" == "DEPLOYED" ]; then
  echo "yes."
else
  echo "no."
  task_id=$(curl -s \
    -X POST "$OS_URL/_plugins/_ml/models/$model_id/_deploy" \
    -H "Content-Type: application/json" | jq -r '.task_id')
  echo "Deploying the model (task_id=$task_id)"
  start_time=$(date +%s)
  while [ "$state" != "DEPLOYED" ]
  do
    response=$(curl \
      -s -X GET "$OS_URL/_plugins/_ml/models/$model_id" \
      -H "Content-Type: application/json")
    state=$(echo "$response" | jq -r '.model_state')
    if [ "$state" == "DEPLOY_FAILED" ]; then
      echo "Deploy failed."
      echo "Try again later."
      exit 1
    fi
    elapsed=$(( $(date +%s) - start_time ))
    echo -ne "\rPlease wait... ${elapsed}s "
    sleep 1
  done
  echo "done."
fi
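# The model state can be re-checked at any time with:
#   curl -s "$OS_URL/_plugins/_ml/models/$model_id" | jq -r '.model_state'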
# Create the ingest pipeline
echo -ne "Creating the ingest pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/_ingest/pipeline/nlp-ingest-pipeline" \
  -H "Content-Type: application/json" \
  -d "{
    \"description\": \"An NLP ingest pipeline\",
    \"processors\": [
      {
        \"remove\": {
          \"field\": \"text\",
          \"if\": \"ctx?.text?.trim() == ''\"
        }
      },
      {
        \"remove\": {
          \"field\": \"title\",
          \"if\": \"ctx?.title?.trim() == ''\"
        }
      },
      {
        \"text_embedding\": {
          \"model_id\": \"$model_id\",
          \"field_map\": {
            \"text\": \"text_embedding\",
            \"title\": \"title_embedding\"
          }
        }
      }
    ]
  }"
echo "done."
# Create the hybrid search pipeline
echo -ne "Creating the hybrid search pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/_search/pipeline/nlp-search-pipeline" \
  -H "Content-Type: application/json" \
  -d '{
    "description": "Pre and post processor for hybrid search",
    "request_processors": [{
      "neural_query_enricher": {
        "description": "Sets the default model ID at index and field levels (which does not actually work)",
        "default_model_id": "'"$model_id"'"
      }
    }],
    "phase_results_processors": [{
      "normalization-processor": {
        "normalization": {"technique": "min_max"},
        "combination": {
          "technique": "arithmetic_mean",
          "parameters": {"weights": [0.7, 0.3]}
        }
      }
    }]
  }'
echo "done."
# Check if NLP_INDEX exists and create it if not
echo -ne "Checking if index $NLP_INDEX exists... "
response=$(curl -s -o /dev/null -w "%{http_code}" -I "$OS_URL/$NLP_INDEX")
if [ "$response" -eq 200 ]; then
echo "yes."
else
echo "no."
echo -ne "Creating NLP index... "
curl -s -o /dev/null \
-X PUT "$OS_URL/$NLP_INDEX" \
-H "Content-Type: application/json" \
-d '{
"settings": {
"index.knn": true,
"default_pipeline": "nlp-ingest-pipeline"
},
"mappings": {
"properties": {
"text": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
},
"title": {
"type": "text",
"analyzer": "english",
"fields": {
"keyword": {"type": "keyword", "ignore_above": 256},
"exact": {
"type": "text",
"analyzer": "standard"
}
}
},
"title_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
},
"text_embedding": {
"type": "knn_vector",
"dimension": 768,
"method": {
"engine": "lucene",
"space_type": "l2",
"name": "hnsw",
"parameters": {}
}
}
}
}}'
echo "done."
fi
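# To verify the mapping (e.g. the knn_vector fields and the .exact subfields
# used for exact search):
#   curl -s "$OS_URL/$NLP_INDEX/_mapping" | jq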
# Set hybrid search as default search pipeline
echo -ne "Setting hybrid search as default search pipeline... "
curl -s -o /dev/null \
  -X PUT "$OS_URL/$NLP_INDEX/_settings" \
  -H "Content-Type: application/json" \
  -d '{
    "index.search.default_pipeline": "nlp-search-pipeline"
  }'
echo "done."
# Ask user if they want to begin reindexing
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')
echo "Ready to begin re-indexing. It may take a while (~10 min)."
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"
read -p "Do you want to begin re-indexing? (y/n) " response
if [[ "$response" =~ ^[Nn]$ ]]; then
echo "If you need to re-index $NLP_INDEX, run this script again."
exit 0
fi
# Re-index the data
task_id=$(curl -s \
  -X POST "$OS_URL/_reindex?wait_for_completion=false" \
  -H "Content-Type: application/json" \
  -d '{
    "source": {
      "index": "'"$KW_INDEX"'"
    },
    "dest": {
      "index": "'"$NLP_INDEX"'"
    }
  }' | jq -r '.task')
echo "Re-indexing the data (task_id=$task_id)"
completed="false"
start_time=$(date +%s)
while [ "$completed" != "true" ]
do
  completed=$(curl -s -X GET "$OS_URL/_tasks/$task_id" \
    -H "Content-Type: application/json" | jq -r '.completed')
  elapsed=$(( $(date +%s) - start_time ))
  echo -ne "\rThis may take a while (~10 min)... ${elapsed}s "
  sleep 1
done
echo "done."
kw_count=$(curl -s -X GET "$OS_URL/$KW_INDEX/_count" | jq -r '.count')
nlp_count=$(curl -s -X GET "$OS_URL/$NLP_INDEX/_count" | jq -r '.count')
echo "Indexed items in $KW_INDEX: $kw_count"
echo "Indexed items in $NLP_INDEX: $nlp_count"
# Update .env.local
echo "Updating .env.local with:"
echo "OPENSEARCH_INDEX=$NLP_INDEX"
echo "OPENSEARCH_MODEL_ID=$model_id"
if grep -q "^OPENSEARCH_INDEX=" ".env.local"; then
sed -i '' "s|^OPENSEARCH_INDEX=.*|OPENSEARCH_INDEX=$NLP_INDEX|" ".env.local"
else
echo "OPENSEARCH_INDEX=item-nlp" >> ".env.local"
fi
if grep -q "^OPENSEARCH_MODEL_ID=" ".env.local"; then
sed -i '' "s|^OPENSEARCH_MODEL_ID=.*|OPENSEARCH_MODEL_ID=$model_id|" ".env.local"
else
echo "OPENSEARCH_MODEL_ID=$model_id" >> ".env.local"
fi
echo "done."
cat <<EOF
To enable semantic search, you should restart your containers:
> ./sndev restart
Do NOT rebuild the stackernews_os volume or you will have to run this
process again.
EOF