diff --git a/tests/perf/__init__.py b/tests/perf/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/perf/fixtures/dense_100_768.parquet b/tests/perf/fixtures/dense_100_768.parquet
new file mode 100644
index 00000000..06ec8629
Binary files /dev/null and b/tests/perf/fixtures/dense_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_0_100_768.parquet b/tests/perf/fixtures/query_matches_0_100_768.parquet
new file mode 100644
index 00000000..3dbe4c30
Binary files /dev/null and b/tests/perf/fixtures/query_matches_0_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_1_100_768.parquet b/tests/perf/fixtures/query_matches_1_100_768.parquet
new file mode 100644
index 00000000..2168c340
Binary files /dev/null and b/tests/perf/fixtures/query_matches_1_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_2_100_768.parquet b/tests/perf/fixtures/query_matches_2_100_768.parquet
new file mode 100644
index 00000000..ab32b598
Binary files /dev/null and b/tests/perf/fixtures/query_matches_2_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_3_100_768.parquet b/tests/perf/fixtures/query_matches_3_100_768.parquet
new file mode 100644
index 00000000..84ad207d
Binary files /dev/null and b/tests/perf/fixtures/query_matches_3_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_4_100_768.parquet b/tests/perf/fixtures/query_matches_4_100_768.parquet
new file mode 100644
index 00000000..90a1a090
Binary files /dev/null and b/tests/perf/fixtures/query_matches_4_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_5_100_768.parquet b/tests/perf/fixtures/query_matches_5_100_768.parquet
new file mode 100644
index 00000000..e0e27ad6
Binary files /dev/null and b/tests/perf/fixtures/query_matches_5_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_6_100_768.parquet b/tests/perf/fixtures/query_matches_6_100_768.parquet
new file mode 100644
index 00000000..3842822d
Binary files /dev/null and b/tests/perf/fixtures/query_matches_6_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_7_100_768.parquet b/tests/perf/fixtures/query_matches_7_100_768.parquet
new file mode 100644
index 00000000..08a5ba1f
Binary files /dev/null and b/tests/perf/fixtures/query_matches_7_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_8_100_768.parquet b/tests/perf/fixtures/query_matches_8_100_768.parquet
new file mode 100644
index 00000000..3a0e1a4b
Binary files /dev/null and b/tests/perf/fixtures/query_matches_8_100_768.parquet differ
diff --git a/tests/perf/fixtures/query_matches_9_100_768.parquet b/tests/perf/fixtures/query_matches_9_100_768.parquet
new file mode 100644
index 00000000..0c51de28
Binary files /dev/null and b/tests/perf/fixtures/query_matches_9_100_768.parquet differ
diff --git a/tests/perf/fixtures/sparse_100.parquet b/tests/perf/fixtures/sparse_100.parquet
new file mode 100644
index 00000000..cc9fc55a
Binary files /dev/null and b/tests/perf/fixtures/sparse_100.parquet differ
diff --git a/tests/perf/helpers.py b/tests/perf/helpers.py
new file mode 100644
index 00000000..315e9de6
--- /dev/null
+++ b/tests/perf/helpers.py
@@ -0,0 +1,9 @@
+import os
+import pandas as pd
+
+
+def load_fixture(fixture_name):
+    full_path = os.path.join(os.path.dirname(__file__), "fixtures", fixture_name)
+    df = pd.read_parquet(full_path)
+    vectors = df.to_dict(orient="records")
+    return vectors
diff --git a/tests/perf/integration/__init__.py b/tests/perf/integration/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/perf/integration/test_query_dense.py b/tests/perf/integration/test_query_dense.py
new file mode 100644
index 00000000..040c2fdc
--- /dev/null
+++ b/tests/perf/integration/test_query_dense.py
@@ -0,0 +1,90 @@
+import json
+import numpy as np
+from pinecone import Pinecone
+from ..helpers import load_fixture
+
+
+def run_query(idx, query_vector, include_values, include_metadata):
+    resp = idx.query(
+        vector=query_vector,
+        top_k=100,
+        include_values=include_values,
+        include_metadata=include_metadata,
+    )
+    return resp
+
+
+def fake_results(i):
+    matches = load_fixture(f"query_matches_{i}_100_768.parquet")
+    # Convert NumPy arrays to Python lists for JSON serialization
+    for match in matches:
+        if isinstance(match.get("values"), np.ndarray):
+            match["values"] = match["values"].tolist()
+    return {"results": [], "matches": matches, "namespace": f"ns{i}", "usage": {"readUnits": 1}}
+
+
+class TestQueryDenseVectors:
+    def test_query_dim768_topk100(self, benchmark, mocker):
+        pc = Pinecone(api_key="fake_api_key")
+        idx = pc.Index(host="https://fakehost.pinecone.io/")
+
+        response_dict = fake_results(0)
+
+        # Mock the transport-level request so the benchmark never hits the network
+        mock_request = mocker.Mock()
+        response = mocker.Mock()
+        response.configure_mock(
+            status=200,
+            headers={"content-type": "application/json"},
+            getheaders=mocker.Mock(return_value={"content-type": "application/json"}),
+            data=json.dumps(response_dict).encode("utf-8"),
+            raise_for_status=mocker.Mock(),
+        )
+        mock_request.return_value = response
+        idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
+
+        query_vector = load_fixture("dense_100_768.parquet")
+        query_vector = query_vector[0]["values"].tolist()
+
+        # Run the query through the benchmark fixture
+        result = benchmark(
+            run_query, idx, query_vector, include_values=False, include_metadata=False
+        )
+
+        assert result.results is None
+        assert len(result.matches) == 100
+        assert result.usage.read_units == 1
+
+    def test_query_dim768_topk100_include_values(self, benchmark, mocker):
+        pc = Pinecone(api_key="fake_api_key")
+        idx = pc.Index(host="https://fakehost.pinecone.io/")
+
+        response_dict = fake_results(0)
+        dense_vectors = load_fixture("dense_100_768.parquet")
+        for i, m in enumerate(response_dict["matches"]):
+            m["values"] = dense_vectors[i]["values"].tolist()
+            m["metadata"] = dense_vectors[i]["metadata"]
+
+        # Mock the transport-level request so the benchmark never hits the network
+        mock_request = mocker.Mock()
+        response = mocker.Mock()
+        response.configure_mock(
+            status=200,
+            headers={"content-type": "application/json"},
+            getheaders=mocker.Mock(return_value={"content-type": "application/json"}),
+            data=json.dumps(response_dict).encode("utf-8"),
+            raise_for_status=mocker.Mock(),
+        )
+        mock_request.return_value = response
+        idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
+
+        query_vector = dense_vectors[0]["values"].tolist()
+
+        # Run the query through the benchmark fixture
+        result = benchmark(run_query, idx, query_vector, include_values=True, include_metadata=True)
+
+        assert result.results is None
+        assert len(result.matches) == 100
+        assert len(result.matches[0]["values"]) == 768
+        assert result.matches[0]["metadata"] is not None
+        assert result.usage.read_units == 1
diff --git a/tests/perf/integration/test_upsert_dense.py b/tests/perf/integration/test_upsert_dense.py
new file mode 100644
index 00000000..44658820
--- /dev/null
+++ b/tests/perf/integration/test_upsert_dense.py
@@ -0,0 +1,56 @@
+import json
+from pinecone import Pinecone
+from ..helpers import load_fixture
+
+
+def upsert(idx, vectors):
+    resp = idx.upsert(vectors=vectors, batch_size=25)
+    return resp
+
+
+class TestUpsertDenseVectors:
+    def test_upsert_n100_dim768_dict_vectors(self, benchmark, mocker):
+        vectors = load_fixture("dense_100_768.parquet")
+        pc = Pinecone(api_key="fake_api_key")
+        idx = pc.Index(host="https://fakehost.pinecone.io/")
+
+        # Mock the transport-level request so the benchmark never hits the network
+        mock_request = mocker.Mock()
+        response = mocker.Mock()
+        response.configure_mock(
+            status=200,
+            headers={"content-type": "application/json"},
+            getheaders=mocker.Mock(return_value={"content-type": "application/json"}),
+            data=json.dumps({"upsertedCount": 25}).encode("utf-8"),
+            raise_for_status=mocker.Mock(),
+        )
+        mock_request.return_value = response
+        idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
+
+        # Run the upsert through the benchmark fixture
+        result = benchmark(upsert, idx, vectors)
+
+        assert result.upserted_count == 100
+
+    def test_upsert_n100_dim768_tuple_vectors(self, benchmark, mocker):
+        vectors = load_fixture("dense_100_768.parquet")
+        vectors = [(v["id"], v["values"], v["metadata"]) for v in vectors]
+        pc = Pinecone(api_key="fake_api_key")
+        idx = pc.Index(host="https://fakehost.pinecone.io/")
+
+        # Mock the transport-level request so the benchmark never hits the network
+        mock_request = mocker.Mock()
+        response = mocker.Mock()
+        response.configure_mock(
+            status=200,
+            headers={"content-type": "application/json"},
+            getheaders=mocker.Mock(return_value={"content-type": "application/json"}),
+            data=json.dumps({"upsertedCount": 25}).encode("utf-8"),
+            raise_for_status=mocker.Mock(),
+        )
+        mock_request.return_value = response
+        idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
+
+        result = benchmark(upsert, idx, vectors)
+
+        assert result.upserted_count == 100
diff --git a/tests/perf/integration/test_upsert_sparse.py b/tests/perf/integration/test_upsert_sparse.py
new file mode 100644
index 00000000..97aa61e3
--- /dev/null
+++ b/tests/perf/integration/test_upsert_sparse.py
@@ -0,0 +1,42 @@
+import json
+from pinecone import Pinecone
+from ..helpers import load_fixture
+
+
+def upsert(idx, vectors):
+    resp = idx.upsert(vectors=vectors, batch_size=25)
+    return resp
+
+
+class TestUpsertSparseVectors:
+    def test_upsert_n100_dim768_dict_vectors(self, benchmark, mocker):
+        vectors = load_fixture("sparse_100.parquet")
+        vectors = [
+            {
+                "id": v["id"],
+                "values": v["values"],
+                "sparse_values": {"indices": v["sparse_indices"], "values": v["sparse_values"]},
+                "metadata": v["metadata"],
+            }
+            for v in vectors
+        ]
+        pc = Pinecone(api_key="fake_api_key")
+        idx = pc.Index(host="https://fakehost.pinecone.io/")
+
+        # Mock the transport-level request so the benchmark never hits the network
+        mock_request = mocker.Mock()
+        response = mocker.Mock()
+        response.configure_mock(
+            status=200,
+            headers={"content-type": "application/json"},
+            getheaders=mocker.Mock(return_value={"content-type": "application/json"}),
+            data=json.dumps({"upsertedCount": 25}).encode("utf-8"),
+            raise_for_status=mocker.Mock(),
+        )
+        mock_request.return_value = response
+        idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
+
+        # Run the upsert through the benchmark fixture
+        result = benchmark(upsert, idx, vectors)
+
+        assert result.upserted_count == 100
diff --git a/tests/perf/scripts/build-dense-vectors-fixture.py b/tests/perf/scripts/build-dense-vectors-fixture.py
new file mode 100644
index 00000000..690a5ed5
--- /dev/null
+++ b/tests/perf/scripts/build-dense-vectors-fixture.py
@@ -0,0 +1,38 @@
+import random
+import string
+import pandas as pd
+import numpy as np
+import os
+import uuid
+
+
+def random_string(length):
+    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))
+
+
+def random_embedding_values(dimension):
+    return np.random.rand(dimension).tolist()
+
+
+def build_random_df(num_rows=1000, dimension=1536):
+    df = pd.DataFrame(columns=["id", "values", "metadata"])
+    for i in range(num_rows):
+        df.loc[i] = {
+            "id": random_string(10),
+            "values": random_embedding_values(dimension),
+            "metadata": {"doc_id": str(uuid.uuid4()), "chunk_id": random_string(10)},
+        }
+    return df
+
+
+def build_dense_fixture(num_rows, dimension):
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    fixture_dir = os.path.join(current_dir, "..", "fixtures")
+    filename = os.path.join(fixture_dir, f"dense_{num_rows}_{dimension}.parquet")
+    df = build_random_df(num_rows, dimension)
+    df.to_parquet(filename, index=False)
+    return df
+
+
+if __name__ == "__main__":
+    build_dense_fixture(num_rows=100, dimension=768)
diff --git a/tests/perf/scripts/build-query-response-fixture.py b/tests/perf/scripts/build-query-response-fixture.py
new file mode 100644
index 00000000..13fbc490
--- /dev/null
+++ b/tests/perf/scripts/build-query-response-fixture.py
@@ -0,0 +1,42 @@
+import random
+import string
+import pandas as pd
+import os
+import uuid
+
+
+def random_string(length):
+    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))
+
+
+def build_random_df(num_rows, dimension):  # dimension is accepted but currently unused
+    matches = []
+    for i in range(num_rows):
+        matches.append({"id": f"id{i}", "score": random.random(), "values": []})
+    matches.sort(key=lambda x: x["score"], reverse=True)
+
+    df = pd.DataFrame(columns=["id", "score", "values", "metadata"])
+    for i in range(num_rows):
+        df.loc[i] = {
+            "id": matches[i]["id"],
+            "score": matches[i]["score"],
+            "values": matches[i]["values"],
+            "metadata": {"doc_id": str(uuid.uuid4()), "chunk_id": random_string(10)},
+        }
+    return df
+
+
+def build_query_matches_fixture(num_rows, dimension, filename):
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    fixture_dir = os.path.join(current_dir, "..", "fixtures")
+    filename = os.path.join(fixture_dir, filename)
+    df = build_random_df(num_rows, dimension)
+    df.to_parquet(filename, index=False)
+    return df
+
+
+if __name__ == "__main__":
+    for ns in range(10):
+        build_query_matches_fixture(
+            num_rows=100, dimension=768, filename=f"query_matches_{ns}_100_768.parquet"
+        )
diff --git a/tests/perf/scripts/build-sparse-vectors-fixture.py b/tests/perf/scripts/build-sparse-vectors-fixture.py
new file mode 100644
index 00000000..ec304757
--- /dev/null
+++ b/tests/perf/scripts/build-sparse-vectors-fixture.py
@@ -0,0 +1,41 @@
+import random
+import string
+import pandas as pd
+import numpy as np
+import os
+import uuid
+
+
+def random_string(length):
+    return "".join(random.choice(string.ascii_lowercase) for _ in range(length))
+
+
+def random_embedding_values(dimension):
+    return np.random.rand(dimension).tolist()
+
+
+def build_random_df(num_rows=1000):
+    df = pd.DataFrame(columns=["id", "values", "sparse_indices", "sparse_values", "metadata"])
+    for i in range(num_rows):
+        num_elements = random.randint(50, 100)
+        df.loc[i] = {
+            "id": random_string(10),
+            "values": [],
+            "sparse_indices": [random.randint(1, 100000) for _ in range(num_elements)],
+            "sparse_values": [random.random() for _ in range(num_elements)],
+            "metadata": {"doc_id": str(uuid.uuid4()), "chunk_id": random_string(10)},
+        }
+    return df
+
+
+def build_sparse_fixture(num_rows):
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    fixture_dir = os.path.join(current_dir, "..", "fixtures")
+    filename = os.path.join(fixture_dir, f"sparse_{num_rows}.parquet")
+    df = build_random_df(num_rows)
+    df.to_parquet(filename, index=False)
+    return df
+
+
+if __name__ == "__main__":
+    build_sparse_fixture(num_rows=100)
diff --git a/tests/perf/test_query_namespaces.py b/tests/perf/test_query_namespaces.py
deleted file mode 100644
index b1103c5b..00000000
--- a/tests/perf/test_query_namespaces.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import time
-import random
-import pytest
-from pinecone import Pinecone
-
-latencies = []
-
-
-def call_n_threads(index):
-    query_vec = [random.random() for i in range(1024)]
-    start = time.time()
-    combined_results = index.query_namespaces(
-        vector=query_vec,
-        namespaces=["ns1", "ns2", "ns3", "ns4"],
-        include_values=False,
-        include_metadata=True,
-        filter={"publication_date": {"$eq": "Last3Months"}},
-        top_k=1000,
-    )
-    finish = time.time()
-    # print(f"Query took {finish-start} seconds")
-    latencies.append(finish - start)
-
-    return combined_results
-
-
-class TestQueryNamespacesRest:
-    # @pytest.mark.parametrize("n_threads", [4])
-    # def test_query_namespaces_grpc(self, benchmark, n_threads):
-    #     pc = PineconeGRPC()
-    #     index = pc.Index(
-    #         host="jen1024-dojoi3u.svc.apw5-4e34-81fa.pinecone.io", pool_threads=n_threads
-    #     )
-    #     benchmark.pedantic(call_n_threads, (index,), rounds=10, warmup_rounds=1, iterations=5)
-
-    @pytest.mark.parametrize("rest_lib", ["urllib3", "httpx-http11", "httpx-http2"])
-    def test_query_namespaces_rest(self, benchmark, rest_lib):
-        pc = Pinecone()
-        index = pc.Index(
-            host="jen1024-dojoi3u.svc.apw5-4e34-81fa.pinecone.io",
-            pool_threads=4,
-            connection_pool_maxsize=20,
-        )
-        benchmark.pedantic(call_n_threads, (index,), rounds=10, warmup_rounds=1, iterations=5)
diff --git a/tests/perf/unit/__init__.py b/tests/perf/unit/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/perf/test_query_results_aggregator.py b/tests/perf/unit/test_query_results_aggregator.py
similarity index 50%
rename from tests/perf/test_query_results_aggregator.py
rename to tests/perf/unit/test_query_results_aggregator.py
index 9f33c149..53d26c3d 100644
--- a/tests/perf/test_query_results_aggregator.py
+++ b/tests/perf/unit/test_query_results_aggregator.py
@@ -1,24 +1,22 @@
-import random
+import pytest
 from pinecone.db_data.query_results_aggregator import QueryResultsAggregator
+from ..helpers import load_fixture
 
 
 def fake_results(i):
-    matches = [
-        {"id": f"id{i}", "score": random.random(), "values": [random.random() for _ in range(768)]}
-        for _ in range(1000)
-    ]
-    matches.sort(key=lambda x: x["score"], reverse=True)
+    matches = load_fixture(f"query_matches_{i}_100_768.parquet")
     return {"namespace": f"ns{i}", "matches": matches}
 
 
 def aggregate_results(responses):
-    ag = QueryResultsAggregator(1000)
+    ag = QueryResultsAggregator(100, "cosine")
     for response in responses:
         ag.add_results(response)
     return ag.get_results()
 
 
 class TestQueryResultsAggregatorPerf:
-    def test_my_stuff(self, benchmark):
-        responses = [fake_results(i) for i in range(10)]
+    @pytest.mark.parametrize("num_namespaces", [1, 5, 10])
+    def test_merge_query_results(self, benchmark, num_namespaces):
+        responses = [fake_results(i) for i in range(num_namespaces)]
         benchmark(aggregate_results, responses)
diff --git a/tests/perf/unit/test_sparse_vector_factory.py b/tests/perf/unit/test_sparse_vector_factory.py
new file mode 100644
index 00000000..c3ac6993
--- /dev/null
+++ b/tests/perf/unit/test_sparse_vector_factory.py
@@ -0,0 +1,17 @@
+from pinecone.db_data.sparse_values_factory import SparseValuesFactory
+from ..helpers import load_fixture
+
+
+def build_sparse_values_objects(vectors):
+    for row in vectors:
+        SparseValuesFactory.build(row)
+
+
+class TestSparseVectorFactoryPerf:
+    def test_sparse_vector_factory_100_dict(self, benchmark):
+        sparse_values_data = load_fixture("sparse_100.parquet")
+        vectors = [
+            {"indices": row["sparse_indices"], "values": row["sparse_values"]}
+            for row in sparse_values_data
+        ]
+        benchmark(build_sparse_values_objects, vectors)
diff --git a/tests/perf/unit/test_vector_factory.py b/tests/perf/unit/test_vector_factory.py
new file mode 100644
index 00000000..8c2ac37b
--- /dev/null
+++ b/tests/perf/unit/test_vector_factory.py
@@ -0,0 +1,18 @@
+from pinecone.db_data.vector_factory import VectorFactory
+from ..helpers import load_fixture
+
+
+def build_vector_objects(vector_data):
+    for row in vector_data:
+        VectorFactory.build(row)
+
+
+class TestVectorFactoryPerf:
+    def test_vector_factory_100_768_dict(self, benchmark):
+        vectors = load_fixture("dense_100_768.parquet")
+        benchmark(build_vector_objects, vectors)
+
+    def test_vector_factory_100_768_tuple(self, benchmark):
+        vectors = load_fixture("dense_100_768.parquet")
+        vectors = [(row["id"], row["values"], row["metadata"]) for row in vectors]
+        benchmark(build_vector_objects, vectors)
diff --git a/tests/perf/unit/test_vector_request_factory.py b/tests/perf/unit/test_vector_request_factory.py
new file mode 100644
index 00000000..03425db7
--- /dev/null
+++ b/tests/perf/unit/test_vector_request_factory.py
@@ -0,0 +1,30 @@
+import pytest
+from pinecone.db_data.request_factory import IndexRequestFactory
+from ..helpers import load_fixture
+
+
+def build_upsert_request(vector_data, _check_type):
+    IndexRequestFactory.upsert_request(
+        vectors=vector_data, namespace="ns1", _check_type=_check_type
+    )
+
+
+class TestVectorRequestFactoryPerf:
+    @pytest.mark.parametrize("check_type", [True, False])
+    def test_upsert_request_dense_100_768_dict(self, benchmark, check_type):
+        vectors = load_fixture("dense_100_768.parquet")
+        benchmark(build_upsert_request, vectors, check_type)
+
+    @pytest.mark.parametrize("check_type", [True, False])
+    def test_upsert_request_sparse_100_dict(self, benchmark, check_type):
+        vectors = load_fixture("sparse_100.parquet")
+        vectors = [
+            {
+                "id": row["id"],
+                "values": row["values"],
+                "sparse_values": {"indices": row["sparse_indices"], "values": row["sparse_values"]},
+                "metadata": row["metadata"],
+            }
+            for row in vectors
+        ]
+        benchmark(build_upsert_request, vectors, check_type)
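
Note: each integration test above stubs the urllib3 pool manager with the same block of mock setup, so the benchmark measures only client-side serialization and parsing, never the network. As a sketch of how that boilerplate could be factored into tests/perf/helpers.py (the helper name is hypothetical; it relies only on pytest-mock's mocker fixture and the private idx._vector_api attribute path the tests already use):

import json


def mock_json_response(idx, mocker, body_dict, status=200):
    """Stub the index's HTTP layer so no network call is made.

    Returns the request mock so callers can assert on the calls issued.
    """
    headers = {"content-type": "application/json"}
    response = mocker.Mock()
    response.configure_mock(
        status=status,
        headers=headers,
        getheaders=mocker.Mock(return_value=headers),
        data=json.dumps(body_dict).encode("utf-8"),
        raise_for_status=mocker.Mock(),
    )
    mock_request = mocker.Mock(return_value=response)
    idx._vector_api.api_client.rest_client.pool_manager.request = mock_request
    return mock_request

With this helper, each test's mock setup would reduce to a single line, e.g. mock_json_response(idx, mocker, {"upsertedCount": 25}).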