
Commit b97d5d9
openai rally track - binary quantization
1 parent e31e1bf commit b97d5d9

22 files changed, +1108 -0 lines changed
Lines changed: 95 additions & 0 deletions (new file: Python script that splits the source corpus into the initial-indexing and parallel-indexing document files)

#!/usr/bin/env python3
import bz2
import json
import os
import sys

import pyarrow as pa

OUTPUT_DIR: str = "openai-documents"
INITIAL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-initial-indexing.json.bz2"
PARALLEL_INDEXING_DOCS_FILENAME: str = "open_ai_corpus-parallel-indexing.json.bz2"
DEFAULT_MAX_INITIAL_INDEXING_DOCS: int = -1
DEFAULT_MAX_PARALLEL_INDEXING_DOCS: int = 100_000
PROGRESS_EVERY = 100


def progress_bar(count, total):
    bar_length = 100
    filled_length = int(round(bar_length * count / float(total)))
    percentage = round(100.0 * count / float(total), 1)
    bar = "=" * filled_length + "-" * (bar_length - filled_length)
    sys.stdout.write("[{}] {}{} ... {:,}/{:,}\r".format(bar, percentage, "%", count, total))
    sys.stdout.flush()


def output_documents(input_file_path: str, max_initial_indexing_docs: int, max_parallel_indexing_docs: int):
    if max_parallel_indexing_docs < 0:
        raise ValueError("max_parallel_indexing_docs must be >= 0")

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with pa.memory_map(input_file_path, "rb") as source:
        doc_table = pa.ipc.open_stream(source).read_all()

        if max_initial_indexing_docs < 0:
            # Create as many initial indexing docs as possible while still meeting parallel indexing docs requirements
            initial_indexing_docs = max(0, doc_table.num_rows - max_parallel_indexing_docs)
        else:
            initial_indexing_docs = min(doc_table.num_rows, max_initial_indexing_docs)

        parallel_indexing_docs = min(doc_table.num_rows - initial_indexing_docs, max_parallel_indexing_docs)

        parse_documents(doc_table, initial_indexing_docs, 0, INITIAL_INDEXING_DOCS_FILENAME)
        parse_documents(doc_table, parallel_indexing_docs, initial_indexing_docs, PARALLEL_INDEXING_DOCS_FILENAME)


def parse_documents(doc_table: pa.Table, doc_count: int, table_offset: int, output_filename: str):
    output_file_path = os.path.join(OUTPUT_DIR, output_filename)
    print(f"Writing {doc_count} documents to {output_file_path}")

    with bz2.open(output_file_path, "wt") as output_file:
        if doc_count <= 0:
            # Return here so we always create the output file
            return

        doc_table_sliced = doc_table.slice(offset=table_offset, length=doc_count)

        docs_written = 0
        progress_bar(docs_written, doc_count)

        for record_batch in doc_table_sliced.to_batches(max_chunksize=PROGRESS_EVERY):
            docid_col = record_batch.column("_id")
            title_col = record_batch.column("title")
            text_col = record_batch.column("text")
            emb_col = record_batch.column("embedding")
            for docid, title, text, emb in zip(docid_col, title_col, text_col, emb_col):
                output_file.write(
                    json.dumps(
                        {"docid": docid.as_py(), "title": title.as_py(), "text": text.as_py(), "emb": emb.as_py()}, ensure_ascii=True
                    )
                )
                output_file.write("\n")

            docs_written += record_batch.num_rows
            progress_bar(docs_written, doc_count)

        # Print newline so that progress bar is not overwritten by next print statement
        print()


def parse_arguments():
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <input_file_path> [<max_initial_indexing_docs> <max_parallel_indexing_docs>]")
        exit(1)

    if len(sys.argv) == 2:
        return (sys.argv[1], DEFAULT_MAX_INITIAL_INDEXING_DOCS, DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) == 3:
        return (sys.argv[1], int(sys.argv[2]), DEFAULT_MAX_PARALLEL_INDEXING_DOCS)
    elif len(sys.argv) >= 4:
        return (sys.argv[1], int(sys.argv[2]), int(sys.argv[3]))


if __name__ == "__main__":
    input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs = parse_arguments()
    output_documents(input_file_path, max_initial_indexing_docs, max_parallel_indexing_docs)
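
With the defaults (DEFAULT_MAX_INITIAL_INDEXING_DOCS = -1, DEFAULT_MAX_PARALLEL_INDEXING_DOCS = 100_000), the script keeps back the last 100,000 rows of the Arrow table for the parallel-indexing corpus and writes everything else to the initial-indexing corpus. A minimal sketch of that split arithmetic, using a row count chosen to match the corpus sizes declared in track.json later in this commit:

# Sketch of the corpus split with the script's defaults; the row count is illustrative.
num_rows = 2_680_961        # hypothetical size of the source Arrow table
max_initial = -1            # DEFAULT_MAX_INITIAL_INDEXING_DOCS
max_parallel = 100_000      # DEFAULT_MAX_PARALLEL_INDEXING_DOCS

if max_initial < 0:
    # take as many initial-indexing docs as possible while reserving the parallel-indexing docs
    initial = max(0, num_rows - max_parallel)
else:
    initial = min(num_rows, max_initial)

parallel = min(num_rows - initial, max_parallel)
print(initial, parallel)    # 2580961 100000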
Lines changed: 32 additions & 0 deletions (new file: Python script that extracts query embeddings into queries.json.bz2)

#!/usr/bin/env python3
import bz2
import json
import sys
import typing

import pyarrow as pa

BATCH_SIZE: int = 1000
QUERY_COLUMN: str = "embedding"
OUTPUT_FILENAME: str = "queries.json.bz2"


def output_queries(input_filename: str, queries_file: typing.TextIO):
    with pa.memory_map(input_filename, "rb") as source:
        query_table = pa.ipc.open_stream(source).read_all()
        for record_batch in query_table.to_batches(max_chunksize=BATCH_SIZE):
            query_list = record_batch.column(QUERY_COLUMN)
            for query in query_list:
                queries_file.write(json.dumps(query.as_py()))
                queries_file.write("\n")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: {} <input_file_path>".format(sys.argv[0]))
        exit(1)

    input_filename = sys.argv[1]

    with bz2.open(OUTPUT_FILENAME, "wt") as queries_file:
        output_queries(input_filename, queries_file)
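
Each output line is a bare JSON array holding one query embedding. A minimal sanity check, assuming queries.json.bz2 was produced by the script above and that the embeddings are 1,536-dimensional as in the index mapping:

# Sketch: read back the first few query vectors and check their dimensionality.
import bz2
import json

with bz2.open("queries.json.bz2", "rt") as f:
    for i, line in enumerate(f):
        vector = json.loads(line)                  # one embedding per line, as a JSON array
        print(f"query {i}: {len(vector)} dims")    # expected: 1536 for text-embedding-ada-002
        if i >= 2:
            break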
Lines changed: 1 addition & 0 deletions (new file: Python requirements for the preparation scripts)

pyarrow
Lines changed: 79 additions & 0 deletions (new file: Rally challenge definition, index-and-search)

{
  "name": "index-and-search",
  "description": "",
  "default": true,
  "schedule": [
    {
      "operation": {
        "operation-type": "delete-index"
      }
    },
    {
      "name": "create-index",
      "operation": "create-index"
    },
    {
      "name": "check-cluster-health",
      "operation": "check-cluster-health"
    },
    {
      "name": "initial-documents-indexing",
      "operation": "initial-documents-indexing",
      "warmup-time-period": {{ initial_indexing_bulk_warmup | default(40) | int }},
      "clients": {{ initial_indexing_bulk_indexing_clients | default(5) | int }}
    },
    {
      "name": "refresh-after-index",
      "operation": {
        "operation-type": "refresh",
        "request-timeout": 1000,
        "include-in-reporting": true
      }
    },
    {
      "name": "wait-until-merges-finish-after-index",
      "operation": {
        "operation-type": "index-stats",
        "index": "_all",
        "condition": {
          "path": "_all.total.merges.current",
          "expected-value": 0
        },
        "retry-until-success": true,
        "include-in-reporting": false
      }
    }
    {# serverless-post-ingest-sleep-marker-start #}{%- if post_ingest_sleep|default(false) -%}
    {
      "name": "post-ingest-sleep",
      "operation": {
        "operation-type": "sleep",
        "duration": {{ post_ingest_sleep_duration|default(30) }}
      }
    }
    {%- endif -%}{# serverless-post-ingest-sleep-marker-end #}
    {%- for i in range(p_search_ops|length) %},
    {
      {%- if p_search_ops[i][2] > 0 -%}
      "name": "standalone-search-knn-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}-multiple-clients",
      "operation": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
      {%- else -%}
      "name": "standalone-search-knn-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-multiple-clients",
      "operation": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
      {%- endif -%},
      "warmup-iterations": 500,
      "clients": {{ standalone_search_clients | default(8) | int }},
      "iterations": {{ standalone_search_iterations | default(10000) | int }}
    }
    {%- endfor %}
    {%- for i in range(p_search_ops|length) %},
    {
      {%- if p_search_ops[i][2] > 0 -%}
      "operation": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
      {%- else -%}
      "operation": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
      {%- endif -%}
    }
    {%- endfor %}
  ]
}
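
For every (k, num-candidates, num-rescore) tuple in p_search_ops (the default list is set in the operations file later in this commit), the challenge schedules one standalone multi-client search task and one recall task. A plain-Python mimic of the Jinja loops, for illustration only, showing the task and operation names they generate:

# Sketch: names produced by the challenge's Jinja loops for the default p_search_ops.
# This mimics the template logic in plain Python; it is not part of the track itself.
p_search_ops = [(100, 150, 150), (100, 200, 200), (100, 250, 250), (100, 300, 300),
                (100, 500, 500), (100, 750, 750), (100, 1000, 1000), (100, 1200, 1200),
                (100, 1500, 1500), (100, 2000, 2000)]

for k, num_candidates, num_rescore in p_search_ops:
    suffix = f"{k}-{num_candidates}-{num_rescore}" if num_rescore > 0 else f"{k}-{num_candidates}"
    print(f"standalone-search-knn-local-{suffix}-multiple-clients  ->  knn-search-local-{suffix}")
    print(f"(recall task)                                          ->  knn-recall-local-{suffix}")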
Lines changed: 4 additions & 0 deletions (new file: list of corpus archive filenames)

open_ai_corpus-initial-indexing.json.bz2
open_ai_corpus-initial-indexing-1k.json.bz2
open_ai_corpus-parallel-indexing.json.bz2
open_ai_corpus-parallel-indexing-1k.json.bz2
Lines changed: 31 additions & 0 deletions (new file: index mapping with a 1536-dimensional dense_vector field, bbq_hnsw by default)

{
  "settings": {
    {# non-serverless-index-settings-marker-start #}{%- if build_flavor != "serverless" or serverless_operator == true -%}
    {% if preload_pagecache %}
    "index.store.preload": [ "vec", "vex", "vem", "veq", "veqm", "veb", "vebm"],
    {% endif %}
    "index.number_of_shards": {{number_of_shards | default(1)}},
    "index.number_of_replicas": {{number_of_replicas | default(0)}}
    {%- endif -%}{# non-serverless-index-settings-marker-end #}
  },
  "mappings": {
    "dynamic": false,
    "properties": {
      "docid": {
        "type": "keyword"
      },
      "emb": {
        "type": "dense_vector",
        "element_type": "float",
        "dims": 1536,
        "index": true,
        "similarity": "dot_product",
        "index_options": {
          "type": {{ vector_index_type | default("bbq_hnsw") | tojson }},
          "ef_construction": 100,
          "m": 16
        }
      }
    }
  }
}
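
The mapping indexes 1,536-dimensional float vectors with dot_product similarity, and the default index_options type bbq_hnsw builds the HNSW graph over binary-quantized vectors, which is the binary quantization this commit targets. A hedged sketch of creating an equivalent index outside Rally with the official elasticsearch Python client (8.x); the endpoint and the resolved template defaults shown here are assumptions:

# Sketch: create an index matching this mapping's rendered defaults, outside Rally.
# Assumes the elasticsearch 8.x Python client and a locally reachable cluster.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")   # hypothetical endpoint
es.indices.create(
    index="openai",
    settings={"index.number_of_shards": 1, "index.number_of_replicas": 0},
    mappings={
        "dynamic": False,
        "properties": {
            "docid": {"type": "keyword"},
            "emb": {
                "type": "dense_vector",
                "element_type": "float",
                "dims": 1536,
                "index": True,
                "similarity": "dot_product",
                # bbq_hnsw is the template default (vector_index_type): HNSW over binary-quantized vectors
                "index_options": {"type": "bbq_hnsw", "ef_construction": 100, "m": 16},
            },
        },
    },
)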
Lines changed: 56 additions & 0 deletions (new file: Rally operations, indexing plus kNN search and recall)

{
  "name": "create-index",
  "operation-type": "create-index",
  "settings": {{index_settings | default({}) | tojson}}
},
{
  "name": "check-cluster-health",
  "operation-type": "cluster-health",
  "request-params": {
    "wait_for_status": "green"
  },
  "retry-until-success": true
},
{
  "name": "initial-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-initial-indexing",
  "bulk-size": {{initial_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{initial_indexing_ingest_percentage | default(100)}}
},
{
  "name": "parallel-documents-indexing",
  "operation-type": "bulk",
  "corpora": "openai-parallel-indexing",
  "bulk-size": {{parallel_indexing_bulk_size | default(500)}},
  "ingest-percentage": {{parallel_indexing_ingest_percentage | default(100)}}
}
{%- set p_search_ops = (search_ops | default([(100, 150, 150), (100, 200, 200), (100, 250, 250), (100, 300, 300), (100, 500, 500), (100, 750, 750), (100, 1000, 1000), (100, 1200, 1200), (100, 1500, 1500), (100, 2000, 2000)])) %}
{%- for i in range(p_search_ops|length) %},
{
  {%- if p_search_ops[i][2] > 0 -%}
  "name": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
  {%- else -%}
  "name": "knn-search-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
  {%- endif -%},
  "operation-type": "search",
  "param-source": "knn-param-source",
  "k": {{p_search_ops[i][0]}},
  "num-candidates": {{p_search_ops[i][1]}},
  "num-rescore": {{p_search_ops[i][2]}},
  "global-rescore": false
},
{
  {%- if p_search_ops[i][2] > 0 -%}
  "name": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}-{{p_search_ops[i][2]}}"
  {%- else -%}
  "name": "knn-recall-local-{{p_search_ops[i][0]}}-{{p_search_ops[i][1]}}"
  {%- endif -%},
  "operation-type": "knn-recall",
  "param-source": "knn-recall-param-source",
  "k": {{p_search_ops[i][0]}},
  "num-candidates": {{p_search_ops[i][1]}},
  "num-rescore": {{p_search_ops[i][2]}},
  "global-rescore": false
}
{%- endfor %}
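
Each generated search operation drives the track's knn-param-source with k, num-candidates, and num-rescore, and the paired knn-recall operation measures recall for the same parameters. The exact request body is assembled by the track's custom param source, but a rough, hedged approximation of the underlying kNN search request looks like this (query vector and values are illustrative):

# Sketch: roughly what a knn-search-local-100-150-* operation asks Elasticsearch for,
# expressed as a plain _search body. Illustrative only; the real body comes from knn-param-source.
import json

knn_body = {
    "knn": {
        "field": "emb",
        "query_vector": [0.0] * 1536,   # a real query embedding is substituted per request
        "k": 100,                       # p_search_ops[i][0]
        "num_candidates": 150,          # p_search_ops[i][1]
    },
    "_source": False,
}
print(json.dumps(knn_body)[:100] + " ...")
# num-rescore (p_search_ops[i][2]) controls rescoring of quantized candidates with the raw
# float vectors; how it is applied is up to the track's param source.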
Binary file not shown (one of the committed files is binary and has no text diff).
Lines changed: 44 additions & 0 deletions (new file: top-level Rally track definition collecting operations and challenges)

{% import "rally.helpers" as rally with context %}

{
  "version": 2,
  "description": "Benchmark for vector search using the OpenAI text-embedding-ada-002 model",
  "indices": [
    {
      "name": "openai",
      "body": "index-{{ mapping_type | default("vectors-only-mapping-with-docid") }}-mapping.json"
    }
  ],
  "corpora": [
    {
      "name": "openai-initial-indexing",
      "base-url": "https://rally-tracks.elastic.co/openai_vector",
      "documents": [
        {
          "source-file": "open_ai_corpus-initial-indexing.json.bz2",
          "document-count": 2580961,
          "compressed-bytes": 32076749416,
          "uncompressed-bytes": 90263571686
        }
      ]
    },
    {
      "name": "openai-parallel-indexing",
      "base-url": "https://rally-tracks.elastic.co/openai_vector",
      "documents": [
        {
          "source-file": "open_ai_corpus-parallel-indexing.json.bz2",
          "document-count": 100000,
          "compressed-bytes": 1242787434,
          "uncompressed-bytes": 3497178196
        }
      ]
    }
  ],
  "operations": [
    {{ rally.collect(parts="operations/*.json") }}
  ],
  "challenges": [
    {{ rally.collect(parts="challenges/*.json") }}
  ]
}
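
The corpus metadata above implies roughly 34 KiB of JSON per document, which is plausible for a 1,536-dimensional float embedding serialized as text plus the docid, title, and text fields. A quick check of that arithmetic:

# Sketch: back-of-the-envelope check of the initial-indexing corpus metadata above.
uncompressed_bytes = 90_263_571_686
compressed_bytes = 32_076_749_416
doc_count = 2_580_961

print(f"~{uncompressed_bytes / doc_count / 1024:.1f} KiB of JSON per document")  # about 34 KiB
print(f"~{uncompressed_bytes / compressed_bytes:.1f}x bz2 compression ratio")    # about 2.8x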
