
Commit b97d5d9
openai rally track - binary quantization
1 parent e31e1bf commit b97d5d9

22 files changed, +1108 -0 lines changed
Lines changed: 95 additions & 0 deletions (new file: Python script that splits the source corpus into the initial-indexing and parallel-indexing document files)

#!/usr/bin/env python3
import bz2
import json
import os
import sys

import pyarrow as pa

OUTPUT_DIR: str = "openai-documents"
INITIAL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-initial-indexing.json.bz2"
PARALLEL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-parallel-indexing.json.bz2"
DEFAULT_MAX_INITIAL_INDEXING_DOCS: int = -1
DEFAULT_MAX_PARALLEL_INDEXING_DOCS: int = 100_000
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        doc_table = pa.ipc.open_stream(source).read_all()

        if max_initial_indexing_docs < 0:
            # Create as many initial indexing docs as possible while still meeting parallel indexing docs requirements
            initial_indexing_docs = max(0, doc_table.num_rows - max_parallel_indexing_docs)
        else:
            initial_indexing_docs = min(doc_table.num_rows, max_initial_indexing_docs)

        parallel_indexing_docs = min(doc_table.num_rows - initial_indexing_docs, max_parallel_indexing_docs)

        parse_documents(doc_table, initial_indexing_docs, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(doc_table, parallel_indexing_docs, initial_indexing_docs, PARALLEL_INDEXING_DOCS_FILENAME)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    output_file_path = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {output_file_path}")

    with bz2.open(output_file_path, "wt") as output_file:
        if doc_count <= 0:
            # Return here so we always create the output file
            return

        doc_table_sliced = doc_table.slice(offset=table_offset, length=doc_count)

        docs_written = 0
        progress_bar(docs_written, doc_count)

        for record_batch in doc_table_sliced.to_batches(max_chunksize=PROGRESS_EVERY):
            docid_col = record_batch.column("_id")
            title_col = record_batch.column("title")
            text_col = record_batch.column("text")
            emb_col = record_batch.column("embedding")
            for docid, title, text, emb in zip(docid_col, title_col, text_col, emb_col):
                output_file.write(
                    json.dumps(
                        {"docid": docid.as_py(), "title": title.as_py(), "text": text.as_py(), "emb": emb.as_py()}, ensure_ascii=True
                    )
                )
                output_file.write("\n")

            docs_written += record_batch.num_rows
            progress_bar(docs_written, doc_count)

        # Print newline so that progress bar is not overwritten by next print statement
        print()


def parse_arguments():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input_file_path> [<max_initial_indexing_docs> <max_parallel_indexing_docs>]")
        exit(1)

    if len(sys.argv) == 2:
        return (sys.argv[1], DEFAULT_MAX_INITIAL_INDEXING_DOCS, DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) == 3:
        return (sys.argv[1], int(sys.argv[2]), DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) >= 4:
        return (sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))


if __name__ == "__main__":
    input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs = parse_arguments()
    output_documents(input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs)
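
With the defaults (DEFAULT_MAX_INITIAL_INDEXING_DOCS = -1, DEFAULT_MAX_PARALLEL_INDEXING_DOCS = 100_000), the script keeps back the last 100,000 rows of the Arrow table for the parallel-indexing corpus and writes everything else to the initial-indexing corpus. A minimal sketch of that split arithmetic, using a row count chosen to match the corpus sizes declared in track.json later in this commit:

# Sketch of the corpus split with the script's defaults; the row count is illustrative.
num_rows = 2_680_961        # hypothetical size of the source Arrow table
max_initial = -1            # DEFAULT_MAX_INITIAL_INDEXING_DOCS
max_parallel = 100_000      # DEFAULT_MAX_PARALLEL_INDEXING_DOCS

if max_initial < 0:
    # take as many initial-indexing docs as possible while reserving the parallel-indexing docs
    initial = max(0, num_rows - max_parallel)
else:
    initial = min(num_rows, max_initial)

parallel = min(num_rows - initial, max_parallel)
print(initial, parallel)    # 2580961 100000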
Lines changed: 32 additions & 0 deletions (new file: Python script that extracts query embeddings into queries.json.bz2)

#!/usr/bin/env python3
import bz2
import json
import sys
import typing

import pyarrow as pa

BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"


def output_queries(input_filename: str, queries_file: typing.TextIO):
    with pa.memory_map(input_filename, "rb") as source:
        query_table = pa.ipc.open_stream(source).read_all()
        for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
            query_list = record_batch.column(QUERY_COLUMN)
            for query in query_list:
                queries_file.write(json.dumps(query.as_py()))
                queries_file.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} <input_file_path>".format(sys.argv[0]))
        exit(1)

    input_filename = sys.argv[1]

    with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
        output_queries(input_filename, queries_file)
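
Each output line is a bare JSON array holding one query embedding. A minimal sanity check, assuming queries.json.bz2 was produced by the script above and that the embeddings are 1,536-dimensional as in the index mapping:

# Sketch: read back the first few query vectors and check their dimensionality.
import bz2
import json

with bz2.open("queries.json.bz2", "rt") as f:
    for i, line in enumerate(f):
        vector = json.loads(line)                  # one embedding per line, as a JSON array
        print(f"query {i}: {len(vector)} dims")    # expected: 1536 for text-embedding-ada-002
        if i >= 2:
            break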
Lines changed: 1 addition & 0 deletions (new file: Python requirements for the preparation scripts)

pyarrow
Lines changed: 79 additions & 0 deletions (new file: Rally challenge definition, index-and-search)

{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "initial-documents-indexing",
      "operation": "initial-documents-indexing",
      "warmup-time-period": {{ initial_indexing_bulk_warmup | default(40) | int }},
      "clients": {{ initial_indexing_bulk_indexing_clients | default(5) | int }}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    }
    {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
    {
      "name": "post-ingest-sleep",
      "operation": {
        "operation-type": "sleep",
        "duration": {{ post_ingest_sleep_duration|default(30) }}
      }
    }
    {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
    {%- for i in range(p_search_ops|length) %},
    {
      {%- if p_search_ops[i][2] > 0 -%}
      "name": "standalone-search-knn-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-multiple-clients",
      "operation": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
      {%- else -%}
      "name": "standalone-search-knn-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-multiple-clients",
      "operation": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
      {%- endif -%},
      "warmup-iterations": 500,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    }
    {%- endfor %}
    {%- for i in range(p_search_ops|length) %},
    {
      {%- if p_search_ops[i][2] > 0 -%}
      "operation": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
      {%- else -%}
      "operation": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
      {%- endif -%}
    }
    {%- endfor %}
  ]
}
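
For every (k, num-candidates, num-rescore) tuple in p_search_ops (the default list is set in the operations file later in this commit), the challenge schedules one standalone multi-client search task and one recall task. A plain-Python mimic of the Jinja loops, for illustration only, showing the task and operation names they generate:

# Sketch: names produced by the challenge's Jinja loops for the default p_search_ops.
# This mimics the template logic in plain Python; it is not part of the track itself.
p_search_ops = [(100, 150, 150), (100, 200, 200), (100, 250, 250), (100, 300, 300),
                (100, 500, 500), (100, 750, 750), (100, 1000, 1000), (100, 1200, 1200),
                (100, 1500, 1500), (100, 2000, 2000)]

for k, num_candidates, num_rescore in p_search_ops:
    suffix = f"{k}-{num_candidates}-{num_rescore}" if num_rescore > 0 else f"{k}-{num_candidates}"
    print(f"standalone-search-knn-local-{suffix}-multiple-clients  ->  knn-search-local-{suffix}")
    print(f"(recall task)                                          ->  knn-recall-local-{suffix}")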
Lines changed: 4 additions & 0 deletions (new file: list of corpus archive filenames)

open_ai_corpus-initial-indexing.json.bz2
open_ai_corpus-initial-indexing-1k.json.bz2
open_ai_corpus-parallel-indexing.json.bz2
open_ai_corpus-parallel-indexing-1k.json.bz2
Lines changed: 31 additions & 0 deletions (new file: index mapping with a 1536-dimensional dense_vector field, bbq_hnsw by default)

{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "dynamic": false,
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product",
        "index_options": {
          "type": {{ vector_index_type | default("bbq_hnsw") | tojson }},
          "ef_construction": 100,
          "m": 16
        }
      }
    }
  }
}
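
The mapping indexes 1,536-dimensional float vectors with dot_product similarity, and the default index_options type bbq_hnsw builds the HNSW graph over binary-quantized vectors, which is the binary quantization this commit targets. A hedged sketch of creating an equivalent index outside Rally with the official elasticsearch Python client (8.x); the endpoint and the resolved template defaults shown here are assumptions:

# Sketch: create an index matching this mapping's rendered defaults, outside Rally.
# Assumes the elasticsearch 8.x Python client and a locally reachable cluster.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")   # hypothetical endpoint
es.indices.create(
    index="openai",
    settings={"index.number_of_shards": 1, "index.number_of_replicas": 0},
    mappings={
        "dynamic": False,
        "properties": {
            "docid": {"type": "keyword"},
            "emb": {
                "type": "dense_vector",
                "element_type": "float",
                "dims": 1536,
                "index": True,
                "similarity": "dot_product",
                # bbq_hnsw is the template default (vector_index_type): HNSW over binary-quantized vectors
                "index_options": {"type": "bbq_hnsw", "ef_construction": 100, "m": 16},
            },
        },
    },
)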
Lines changed: 56 additions & 0 deletions (new file: Rally operations, indexing plus kNN search and recall)

{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "initial-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-initial-indexing",
  "bulk-size": {{initial_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{initial_indexing_ingest_percentage | default(100)}}
},
{
  "name": "parallel-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-parallel-indexing",
  "bulk-size": {{parallel_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
}
{%- set p_search_ops = (search_ops | default([(100, 150, 150), (100, 200, 200), (100, 250, 250), (100, 300, 300), (100, 500, 500), (100, 750, 750), (100, 1000, 1000), (100, 1200, 1200), (100, 1500, 1500), (100, 2000, 2000)])) %}
{%- for i in range(p_search_ops|length) %},
{
  {%- if p_search_ops[i][2] > 0 -%}
  "name": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
  {%- else -%}
  "name": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
  {%- endif -%},
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": {{p_search_ops[i][0]}},
  "num-candidates": {{p_search_ops[i][1]}},
  "num-rescore": {{p_search_ops[i][2]}},
  "global-rescore": false
},
{
  {%- if p_search_ops[i][2] > 0 -%}
  "name": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
  {%- else -%}
  "name": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
  {%- endif -%},
  "operation-type": "knn-recall",
  "param-source": "knn-recall-param-source",
  "k": {{p_search_ops[i][0]}},
  "num-candidates": {{p_search_ops[i][1]}},
  "num-rescore": {{p_search_ops[i][2]}},
  "global-rescore": false
}
{%- endfor %}
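
Each generated search operation drives the track's knn-param-source with k, num-candidates, and num-rescore, and the paired knn-recall operation measures recall for the same parameters. The exact request body is assembled by the track's custom param source, but a rough, hedged approximation of the underlying kNN search request looks like this (query vector and values are illustrative):

# Sketch: roughly what a knn-search-local-100-150-* operation asks Elasticsearch for,
# expressed as a plain _search body. Illustrative only; the real body comes from knn-param-source.
import json

knn_body = {
    "knn": {
        "field": "emb",
        "query_vector": [0.0] * 1536,   # a real query embedding is substituted per request
        "k": 100,                       # p_search_ops[i][0]
        "num_candidates": 150,          # p_search_ops[i][1]
    },
    "_source": False,
}
print(json.dumps(knn_body)[:100] + " ...")
# num-rescore (p_search_ops[i][2]) controls rescoring of quantized candidates with the raw
# float vectors; how it is applied is up to the track's param source.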
Binary file not shown (one of the committed files is binary and has no text diff).
Lines changed: 44 additions & 0 deletions (new file: top-level Rally track definition collecting operations and challenges)

{% import "rally.helpers" as rally with context %}

{
  "version": 2,
  "description": "Benchmark for vector search using the OpenAI text-embedding-ada-002 model",
  "indices": [
    {
      "name": "openai",
      "body": "index-{{ mapping_type | default("vectors-only-mapping-with-docid") }}-mapping.json"
    }
  ],
  "corpora": [
    {
      "name": "openai-initial-indexing",
      "base-url": "https://rally-tracks.elastic.co/openai_vector",
      "documents": [
        {
          "source-file": "open_ai_corpus-initial-indexing.json.bz2",
          "document-count": 2580961,
          "compressed-bytes": 32076749416,
          "uncompressed-bytes": 90263571686
        }
      ]
    },
    {
      "name": "openai-parallel-indexing",
      "base-url": "https://rally-tracks.elastic.co/openai_vector",
      "documents": [
        {
          "source-file": "open_ai_corpus-parallel-indexing.json.bz2",
          "document-count": 100000,
          "compressed-bytes": 1242787434,
          "uncompressed-bytes": 3497178196
        }
      ]
    }
  ],
  "operations": [
    {{ rally.collect(parts="operations/*.json") }}
  ],
  "challenges": [
    {{ rally.collect(parts="challenges/*.json") }}
  ]
}
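
The corpus metadata above implies roughly 34 KiB of JSON per document, which is plausible for a 1,536-dimensional float embedding serialized as text plus the docid, title, and text fields. A quick check of that arithmetic:

# Sketch: back-of-the-envelope check of the initial-indexing corpus metadata above.
uncompressed_bytes = 90_263_571_686
compressed_bytes = 32_076_749_416
doc_count = 2_580_961

print(f"~{uncompressed_bytes / doc_count / 1024:.1f} KiB of JSON per document")  # about 34 KiB
print(f"~{uncompressed_bytes / compressed_bytes:.1f}x bz2 compression ratio")    # about 2.8x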
