From d92ced2adaa30a0405ace9ca6cd70a8e217f13d0 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 5 Mar 2024 20:56:16 +0000 Subject: [PATCH 01/21] feat: Support BYOSA in `remote_function` (#407) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 328138730 🦕 --- bigframes/functions/remote_function.py | 21 +++++++++-- bigframes/pandas/__init__.py | 2 + bigframes/session/__init__.py | 9 +++++ tests/system/large/test_remote_function.py | 43 ++++++++++++++++++++++ 4 files changed, 71 insertions(+), 4 deletions(-) diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index c31105a021..5bc8291f59 100644 --- a/bigframes/functions/remote_function.py +++ b/bigframes/functions/remote_function.py @@ -129,6 +129,7 @@ def __init__( bq_connection_client, bq_connection_id, cloud_resource_manager_client, + cloud_function_service_account, ): self._gcp_project_id = gcp_project_id self._cloud_function_region = cloud_function_region @@ -140,6 +141,7 @@ def __init__( self._bq_connection_manager = clients.BqConnectionManager( bq_connection_client, cloud_resource_manager_client ) + self._cloud_function_service_account = cloud_function_service_account def create_bq_remote_function( self, input_args, input_types, output_type, endpoint, bq_function_name @@ -384,6 +386,9 @@ def create_cloud_function(self, def_, cf_name, package_requirements=None): function.service_config = 
functions_v2.ServiceConfig() function.service_config.available_memory = "1024M" function.service_config.timeout_seconds = 600 + function.service_config.service_account_email = ( + self._cloud_function_service_account + ) create_function_request.function = function # Create the cloud function and wait for it to be ready to use @@ -591,6 +596,7 @@ def remote_function( reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. @@ -646,12 +652,12 @@ def remote_function( Client to use for BigQuery operations. If this param is not provided then bigquery client from the session would be used. bigquery_connection_client (google.cloud.bigquery_connection_v1.ConnectionServiceClient, Optional): - Client to use for cloud functions operations. If this param is not - provided then functions client from the session would be used. - cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): Client to use for BigQuery connection operations. If this param is not provided then bigquery connection client from the session would be used. + cloud_functions_client (google.cloud.functions_v2.FunctionServiceClient, Optional): + Client to use for cloud functions operations. If this param is not + provided then the functions client from the session would be used. resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): Client to use for cloud resource management operations, e.g. for getting and setting IAM roles on cloud resources. If this param is @@ -686,7 +692,13 @@ def remote_function( Explicit name of the external package dependencies. Each dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. 
- + cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided then + the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. """ import bigframes.pandas as bpd @@ -787,6 +799,7 @@ def wrapper(f): bigquery_connection_client, bq_connection_id, resource_manager_client, + cloud_function_service_account, ) rf_name, cf_name = remote_function_client.provision_bq_remote_function( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 110978a7f1..3c9bb003cc 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -615,6 +615,7 @@ def remote_function( reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, ): return global_session.with_default_session( bigframes.session.Session.remote_function, @@ -625,6 +626,7 @@ def remote_function( reuse=reuse, name=name, packages=packages, + cloud_function_service_account=cloud_function_service_account, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4bd205afea..ef4a349244 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1337,6 +1337,7 @@ def remote_function( reuse: bool = True, name: Optional[str] = None, packages: Optional[Sequence[str]] = None, + cloud_function_service_account: Optional[str] = None, ): """Decorator to turn a user defined function into a BigQuery remote function. Check out the code samples at: https://cloud.google.com/bigquery/docs/remote-functions#bigquery-dataframes. @@ -1410,6 +1411,13 @@ def remote_function( Explicit name of the external package dependencies. 
Each dependency is added to the `requirements.txt` as is, and can be of the form supported in https://pip.pypa.io/en/stable/reference/requirements-file-format/. + cloud_function_service_account (str, Optional): + Service account to use for the cloud functions. If not provided + then the default service account would be used. See + https://cloud.google.com/functions/docs/securing/function-identity + for more details. Please make sure the service account has the + necessary IAM permissions configured as described in + https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration. Returns: callable: A remote function object pointing to the cloud assets created in the background to support the remote execution. The cloud assets can be @@ -1428,6 +1436,7 @@ def remote_function( reuse=reuse, name=name, packages=packages, + cloud_function_service_account=cloud_function_service_account, ) def read_gbq_function( diff --git a/tests/system/large/test_remote_function.py b/tests/system/large/test_remote_function.py index b33298ae01..77aa3c7603 100644 --- a/tests/system/large/test_remote_function.py +++ b/tests/system/large/test_remote_function.py @@ -1279,3 +1279,46 @@ def square(x): cleanup_remote_function_assets( session.bqclient, session.cloudfunctionsclient, square ) + + +@pytest.mark.skip("This requires additional project config.") +def test_remote_function_via_session_custom_sa(scalars_dfs): + # Set these values to run the test locally + # TODO(shobs): Automate and enable this test + PROJECT = "" + GCF_SERVICE_ACCOUNT = "" + + rf_session = bigframes.Session(context=bigframes.BigQueryOptions(project=PROJECT)) + + try: + + @rf_session.remote_function( + [int], int, reuse=False, cloud_function_service_account=GCF_SERVICE_ACCOUNT + ) + def square_num(x): + if x is None: + return x + return x * x + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_int64_col = scalars_df["int64_col"] + bf_result_col = bf_int64_col.apply(square_num) + bf_result = 
bf_int64_col.to_frame().assign(result=bf_result_col).to_pandas() + + pd_int64_col = scalars_pandas_df["int64_col"] + pd_result_col = pd_int64_col.apply(lambda x: x if x is None else x * x) + pd_result = pd_int64_col.to_frame().assign(result=pd_result_col) + + assert_pandas_df_equal(bf_result, pd_result, check_dtype=False) + + # Assert that the GCF is created with the intended SA + gcf = rf_session.cloudfunctionsclient.get_function( + name=square_num.bigframes_cloud_function + ) + assert gcf.service_config.service_account_email == GCF_SERVICE_ACCOUNT + finally: + # clean up the gcp assets created for the remote function + cleanup_remote_function_assets( + rf_session.bqclient, rf_session.cloudfunctionsclient, square_num + ) From 6478ad75a98fcd3841ab701fd1f5ae0ddc49f761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=2C=20formerly=29?= Date: Tue, 5 Mar 2024 16:45:31 -0600 Subject: [PATCH 02/21] chore: add load tests session for reading large tables (#410) * chore: add load tests session for reading large tables * update junit prefix * xfail for to_pandas_batches * use smaller table but still beyond query results limit --- .kokoro/load/common.cfg | 10 +++ .kokoro/load/load.cfg | 17 ++++ noxfile.py | 11 +++ scripts/create_load_test_tables.py | 109 +++++++++++++++++++++++++ tests/system/conftest.py | 8 +- tests/system/load/test_large_tables.py | 96 ++++++++++++++++++++++ 6 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 .kokoro/load/common.cfg create mode 100644 .kokoro/load/load.cfg create mode 100644 scripts/create_load_test_tables.py create mode 100644 tests/system/load/test_large_tables.py diff --git a/.kokoro/load/common.cfg b/.kokoro/load/common.cfg new file mode 100644 index 0000000000..97e0651aa9 --- /dev/null +++ b/.kokoro/load/common.cfg @@ -0,0 +1,10 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +build_file:
"python-bigquery-dataframes/.kokoro/build.sh" diff --git a/.kokoro/load/load.cfg b/.kokoro/load/load.cfg new file mode 100644 index 0000000000..656614cf73 --- /dev/null +++ b/.kokoro/load/load.cfg @@ -0,0 +1,17 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Only run this nox session. +env_vars: { + key: "NOX_SESSION" + value: "load" +} + +env_vars: { + key: "GOOGLE_CLOUD_PROJECT" + value: "bigframes-load-testing" +} + +env_vars: { + key: "BIGFRAMES_TEST_MODEL_VERTEX_ENDPOINT" + value: "https://us-central1-aiplatform.googleapis.com/v1/projects/272725758477/locations/us-central1/endpoints/590545496255234048" +} diff --git a/noxfile.py b/noxfile.py index 91d26cf695..bcd39f961f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -387,6 +387,17 @@ def e2e(session: nox.sessions.Session): ) +@nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS[-1]) +def load(session: nox.sessions.Session): + """Run the very large tests in system test suite.""" + run_system( + session=session, + prefix_name="load", + test_folder=os.path.join("tests", "system", "load"), + print_duration=True, + ) + + @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def samples(session): """Run the samples test suite.""" diff --git a/scripts/create_load_test_tables.py b/scripts/create_load_test_tables.py new file mode 100644 index 0000000000..d94a33aa5c --- /dev/null +++ b/scripts/create_load_test_tables.py @@ -0,0 +1,109 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import math +import os +import pathlib +import sys + +import google.cloud.bigquery as bigquery + +REPO_ROOT = pathlib.Path(__file__).parent.parent + +PROJECT_ID = os.getenv("GOOGLE_CLOUD_PROJECT") + +if not PROJECT_ID: + print( + "Please set GOOGLE_CLOUD_PROJECT environment variable before running.", + file=sys.stderr, + ) + sys.exit(1) + +DATASET_ID = f"{PROJECT_ID}.load_testing" +TABLE_ID = f"{DATASET_ID}.scalars" +TABLE_ID_FORMAT = f"{DATASET_ID}.scalars_{{size}}" + +KB_BYTES = 1000 +MB_BYTES = 1000 * KB_BYTES +GB_BYTES = 1000 * MB_BYTES +TB_BYTES = 1000 * GB_BYTES +SIZES = ( + ("1mb", MB_BYTES), + ("10mb", 10 * MB_BYTES), + ("100mb", 100 * MB_BYTES), + ("1gb", GB_BYTES), + ("10gb", 10 * GB_BYTES), + ("100gb", 100 * GB_BYTES), + ("1tb", TB_BYTES), +) +SCHEMA_PATH = REPO_ROOT / "tests" / "data" / "scalars_schema.json" +DATA_PATH = REPO_ROOT / "tests" / "data" / "scalars.jsonl" +BQCLIENT = bigquery.Client() + + +def create_dataset(): + dataset = bigquery.Dataset(DATASET_ID) + BQCLIENT.create_dataset(dataset, exists_ok=True) + + +def load_scalars_table(): + schema = BQCLIENT.schema_from_json(SCHEMA_PATH) + job_config = bigquery.LoadJobConfig() + job_config.schema = schema + job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE + job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON + + print(f"Creating {TABLE_ID}") + with open(DATA_PATH, "rb") as data_file: + BQCLIENT.load_table_from_file( + data_file, + TABLE_ID, + job_config=job_config, + ).result() + + +def multiply_table(previous_table_id, target_table_id, multiplier): + clauses = [f"SELECT * FROM `{previous_table_id}`"] * multiplier + query = " UNION ALL ".join(clauses) + job_config = bigquery.QueryJobConfig() + job_config.destination = target_table_id + job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE + print(f"Creating {target_table_id}, {multiplier} x {previous_table_id}") + BQCLIENT.query_and_wait(query, job_config=job_config) + + +def 
create_tables(): + base_table = BQCLIENT.get_table(TABLE_ID) + previous_bytes = base_table.num_bytes + previous_table_id = TABLE_ID + + for table_suffix, target_bytes in SIZES: + # Make sure we exceed the desired bytes by adding to the multiplier. + multiplier = math.ceil(target_bytes / previous_bytes) + 1 + target_table_id = TABLE_ID_FORMAT.format(size=table_suffix) + multiply_table(previous_table_id, target_table_id, multiplier) + + table = BQCLIENT.get_table(target_table_id) + previous_bytes = table.num_bytes + previous_table_id = target_table_id + + +def main(): + create_dataset() + load_scalars_table() + create_tables() + + +if __name__ == "__main__": + main() diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 4aa27d6a19..7ca1882fe0 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -104,6 +104,11 @@ def cloudfunctions_client( return session.cloudfunctionsclient +@pytest.fixture(scope="session") +def project_id(bigquery_client: bigquery.Client) -> str: + return bigquery_client.project + + @pytest.fixture(scope="session") def resourcemanager_client( session: bigframes.Session, @@ -159,9 +164,8 @@ def dataset_id_not_created(bigquery_client: bigquery.Client): @pytest.fixture(scope="session") -def dataset_id_permanent(bigquery_client: bigquery.Client) -> str: +def dataset_id_permanent(bigquery_client: bigquery.Client, project_id: str) -> str: """Create a dataset if it doesn't exist.""" - project_id = bigquery_client.project dataset_id = f"{project_id}.{PERMANENT_DATASET}" dataset = bigquery.Dataset(dataset_id) bigquery_client.create_dataset(dataset, exists_ok=True) diff --git a/tests/system/load/test_large_tables.py b/tests/system/load/test_large_tables.py new file mode 100644 index 0000000000..1d4a6b0a5b --- /dev/null +++ b/tests/system/load/test_large_tables.py @@ -0,0 +1,96 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in 
compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Load test for query (SQL) inputs with large results sizes.""" + +import pytest + +import bigframes.pandas as bpd + +KB_BYTES = 1000 +MB_BYTES = 1000 * KB_BYTES +GB_BYTES = 1000 * MB_BYTES +TB_BYTES = 1000 * GB_BYTES + + +@pytest.mark.parametrize( + ("sql", "expected_bytes"), + ( + pytest.param( + "SELECT * FROM load_testing.scalars_1gb", + GB_BYTES, + id="1gb", + ), + pytest.param( + "SELECT * FROM load_testing.scalars_10gb", + 10 * GB_BYTES, + id="10gb", + ), + pytest.param( + "SELECT * FROM load_testing.scalars_100gb", + 100 * GB_BYTES, + id="100gb", + ), + pytest.param( + "SELECT * FROM load_testing.scalars_1tb", + TB_BYTES, + id="1tb", + ), + ), +) +def test_read_gbq_sql_large_results(sql, expected_bytes): + df = bpd.read_gbq(sql) + assert df.memory_usage().sum() >= expected_bytes + + +def test_df_repr_large_table(): + df = bpd.read_gbq("load_testing.scalars_100gb") + row_count, column_count = df.shape + expected = f"[{row_count} rows x {column_count} columns]" + actual = repr(df) + assert expected in actual + + +def test_series_repr_large_table(): + df = bpd.read_gbq("load_testing.scalars_1tb") + actual = repr(df["string_col"]) + assert actual is not None + + +def test_index_repr_large_table(): + df = bpd.read_gbq("load_testing.scalars_1tb") + actual = repr(df.index) + assert actual is not None + + +# FAILED +# tests/system/load/test_large_tables.py::test_to_pandas_batches_large_table +# google.api_core.exceptions.Forbidden: 403 Response too large to return. +# Consider specifying a destination table in your job... 
+@pytest.mark.xfail +def test_to_pandas_batches_large_table(): + df = bpd.read_gbq("load_testing.scalars_100gb") + expected_row_count, expected_column_count = df.shape + + row_count = 0 + for df in df.to_pandas_batches(): + batch_row_count, batch_column_count = df.shape + assert batch_column_count == expected_column_count + row_count += batch_row_count + + # Attempt to save on memory by manually removing the batch df + # from local memory after finishing with processing. + del df + + assert row_count == expected_row_count From 1dd0f3eb88251203b94a894556867f4197e73343 Mon Sep 17 00:00:00 2001 From: Dan Lee <71398022+dandhlee@users.noreply.github.com> Date: Tue, 5 Mar 2024 19:20:20 -0500 Subject: [PATCH 03/21] chore: update toc entry to properly include summary pages (#415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the files aren't present, docfx does not convert the files from `.yml` extension to `.html`. I've tested this locally to ensure docfx keeps the file extension to `.html` as needed, which we'll need for the new files added for summary pages. The only entry that should change in the future would be `summary_overview.html` to `summary_overview.md`. Filed #414 to keep track of that. 
🦕 --- docs/templates/toc.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 0d6bec5534..66973fc5a2 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -4,13 +4,13 @@ - href: changelog.md name: Changelog - items: - - href: summary_overview.yml + - href: summary_overview.html name: Overview - - href: summary_class.yml + - href: summary_class.html name: Classes - - href: summary_method.yml + - href: summary_method.html name: Methods - - href: summary_property.yml + - href: summary_property.html name: Properties and Attributes name: BigQuery DataFrames API - items: From 5cde3990fac7b527a35c734d143f8b320b896eb8 Mon Sep 17 00:00:00 2001 From: "gcf-owl-bot[bot]" <78513119+gcf-owl-bot[bot]@users.noreply.github.com> Date: Wed, 6 Mar 2024 10:44:02 -0500 Subject: [PATCH 04/21] build(deps): bump cryptography from 42.0.2 to 42.0.4 in .kokoro (#395) Source-Link: https://github.com/googleapis/synthtool/commit/d895aec3679ad22aa120481f746bf9f2f325f26f Post-Processor: gcr.io/cloud-devrel-public-resources/owlbot-python:latest@sha256:98f3afd11308259de6e828e37376d18867fd321aba07826e29e4f8d9cab56bad Co-authored-by: Owl Bot --- .github/.OwlBot.lock.yaml | 4 +-- .kokoro/requirements.txt | 57 ++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/.github/.OwlBot.lock.yaml b/.github/.OwlBot.lock.yaml index d8a1bbca71..e4e943e025 100644 --- a/.github/.OwlBot.lock.yaml +++ b/.github/.OwlBot.lock.yaml @@ -13,5 +13,5 @@ # limitations under the License. 
docker: image: gcr.io/cloud-devrel-public-resources/owlbot-python:latest - digest: sha256:5ea6d0ab82c956b50962f91d94e206d3921537ae5fe1549ec5326381d8905cfa -# created: 2024-01-15T16:32:08.142785673Z + digest: sha256:98f3afd11308259de6e828e37376d18867fd321aba07826e29e4f8d9cab56bad +# created: 2024-02-27T15:56:18.442440378Z diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index bb3d6ca38b..bda8e38c4f 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -93,30 +93,39 @@ colorlog==6.7.0 \ # via # gcp-docuploader # nox -cryptography==41.0.6 \ - --hash=sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596 \ - --hash=sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c \ - --hash=sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660 \ - --hash=sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4 \ - --hash=sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead \ - --hash=sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed \ - --hash=sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3 \ - --hash=sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7 \ - --hash=sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09 \ - --hash=sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c \ - --hash=sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43 \ - --hash=sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65 \ - --hash=sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6 \ - --hash=sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da \ - --hash=sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c \ - --hash=sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b \ - --hash=sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8 \ - 
--hash=sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c \ - --hash=sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d \ - --hash=sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9 \ - --hash=sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86 \ - --hash=sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36 \ - --hash=sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae +cryptography==42.0.4 \ + --hash=sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b \ + --hash=sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce \ + --hash=sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88 \ + --hash=sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7 \ + --hash=sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20 \ + --hash=sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9 \ + --hash=sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff \ + --hash=sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1 \ + --hash=sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764 \ + --hash=sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b \ + --hash=sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298 \ + --hash=sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1 \ + --hash=sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824 \ + --hash=sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257 \ + --hash=sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a \ + --hash=sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129 \ + --hash=sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb \ + 
--hash=sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929 \ + --hash=sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854 \ + --hash=sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52 \ + --hash=sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923 \ + --hash=sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885 \ + --hash=sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0 \ + --hash=sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd \ + --hash=sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2 \ + --hash=sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18 \ + --hash=sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b \ + --hash=sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992 \ + --hash=sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74 \ + --hash=sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660 \ + --hash=sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925 \ + --hash=sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449 # via # gcp-releasetool # secretstorage From 31325a190320bf01ced53d9f4cdb94462daaa06b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=2C=20formerly=29?= Date: Wed, 6 Mar 2024 14:58:16 -0600 Subject: [PATCH 05/21] feat: add engine parameter to `read_parquet` (#413) This makes the default behavior in `read_parquet()` consistent with the other `read_XYZ()` methods. Pandas is used to parse the file and ordering is preserved by default. Use `engine="bigquery"` for the previous behavior (not order preserving). 
--- bigframes/pandas/__init__.py | 5 ++- bigframes/session/__init__.py | 31 +++++++++++++------ tests/system/small/test_session.py | 21 ++++++++++--- .../bigframes_vendored/pandas/io/parquet.py | 8 ++++- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 3c9bb003cc..3120e96b1a 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -597,10 +597,13 @@ def read_pickle( read_pickle.__doc__ = inspect.getdoc(bigframes.session.Session.read_pickle) -def read_parquet(path: str | IO["bytes"]) -> bigframes.dataframe.DataFrame: +def read_parquet( + path: str | IO["bytes"], *, engine: str = "auto" +) -> bigframes.dataframe.DataFrame: return global_session.with_default_session( bigframes.session.Session.read_parquet, path, + engine=engine, ) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index ef4a349244..4b30a3a9d1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1130,19 +1130,32 @@ def read_pickle( def read_parquet( self, path: str | IO["bytes"], + *, + engine: str = "auto", ) -> dataframe.DataFrame: - # Note: "engine" is omitted because it is redundant. Loading a table - # from a pandas DataFrame will just create another parquet file + load - # job anyway. 
table = bigframes_io.random_table(self._anonymous_dataset) - job_config = bigquery.LoadJobConfig() - job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED - job_config.source_format = bigquery.SourceFormat.PARQUET - job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY - job_config.labels = {"bigframes-api": "read_parquet"} + if engine == "bigquery": + job_config = bigquery.LoadJobConfig() + job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED + job_config.source_format = bigquery.SourceFormat.PARQUET + job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY + job_config.labels = {"bigframes-api": "read_parquet"} - return self._read_bigquery_load_job(path, table, job_config=job_config) + return self._read_bigquery_load_job(path, table, job_config=job_config) + else: + read_parquet_kwargs: Dict[str, Any] = {} + if pandas.__version__.startswith("1."): + read_parquet_kwargs["use_nullable_dtypes"] = True + else: + read_parquet_kwargs["dtype_backend"] = "pyarrow" + + pandas_obj = pandas.read_parquet( + path, + engine=engine, # type: ignore + **read_parquet_kwargs, + ) + return self._read_pandas(pandas_obj, "read_parquet") def read_json( self, diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 85573472b9..2e2252be06 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -856,11 +856,19 @@ def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder): pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) -def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): +@pytest.mark.parametrize( + ("engine",), + ( + ("auto",), + ("bigquery",), + ), +) +def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder, engine): scalars_df, _ = scalars_dfs # Include wildcard so that multiple files can be written/read if > 1 GB. 
# https://cloud.google.com/bigquery/docs/exporting-data#exporting_data_into_one_or_more_files path = gcs_folder + test_read_parquet_gcs.__name__ + "*.parquet" + df_in: bigframes.dataframe.DataFrame = scalars_df.copy() # GEOGRAPHY not supported in parquet export. df_in = df_in.drop(columns="geography_col") @@ -869,8 +877,12 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): df_write.index.name = f"ordering_id_{random.randrange(1_000_000)}" df_write.to_parquet(path, index=True) + # Only bigquery engine for reads supports wildcards in path name. + if engine != "bigquery": + path = path.replace("*", "000000000000") + df_out = ( - session.read_parquet(path) + session.read_parquet(path, engine=engine) # Restore order. .set_index(df_write.index.name).sort_index() # Restore index. @@ -880,7 +892,8 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): # DATETIME gets loaded as TIMESTAMP in parquet. See: # https://cloud.google.com/bigquery/docs/exporting-data#parquet_export_details df_out = df_out.assign( - datetime_col=df_out["datetime_col"].astype("timestamp[us][pyarrow]") + datetime_col=df_out["datetime_col"].astype("timestamp[us][pyarrow]"), + timestamp_col=df_out["timestamp_col"].astype("timestamp[us, tz=UTC][pyarrow]"), ) # Make sure we actually have at least some values before comparing. @@ -919,7 +932,7 @@ def test_read_parquet_gcs_compressed( df_write.to_parquet(path, compression=compression, index=True) df_out = ( - session.read_parquet(path) + session.read_parquet(path, engine="bigquery") # Restore order. .set_index(df_write.index.name).sort_index() # Restore index. 
diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 0f664e70fc..877a384b6d 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -9,6 +9,8 @@ class ParquetIOMixin: def read_parquet( self, path: str, + *, + engine: str = "auto", ): r"""Load a Parquet object from the file path (local or Cloud Storage), returning a DataFrame. @@ -23,11 +25,15 @@ def read_parquet( >>> bpd.options.display.progress_bar = None >>> gcs_path = "gs://cloud-samples-data/bigquery/us-states/us-states.parquet" - >>> df = bpd.read_parquet(path=gcs_path) + >>> df = bpd.read_parquet(path=gcs_path, engine="bigquery") Args: path (str): Local or Cloud Storage path to Parquet file. + engine (str): + One of ``'auto', 'pyarrow', 'fastparquet'``, or ``'bigquery'``. + Parquet library to parse the file. If set to ``'bigquery'``, + order is not preserved. Default, ``'auto'``. Returns: bigframes.dataframe.DataFrame: A BigQuery DataFrames. From 0b344023fe71384e49af9893e363a086f19e5258 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 7 Mar 2024 08:20:17 +0000 Subject: [PATCH 06/21] chore: materialize result of `remote_function` early (#408) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue 327662690 🦕 --- bigframes/series.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bigframes/series.py b/bigframes/series.py index 4aef959a76..dfa6fa4b0d 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -1253,11 +1253,17 @@ def apply( ex.message += f"\n{_remote_function_recommendation_message}" raise + # We are working with remote function at this point reprojected_series = Series(self._block._force_reproject()) - return reprojected_series._apply_unary_op( + result_series = reprojected_series._apply_unary_op( ops.RemoteFunctionOp(func=func, apply_on_null=True) ) + # return Series with materialized result so that any error in the remote + # function is caught early + materialized_series = result_series._cached() + return materialized_series + def add_prefix(self, prefix: str, axis: int | str | None = None) -> Series: return Series(self._get_block().add_prefix(prefix)) From 76b252f907055d72556e3e95f6cb5ee41de5b1c2 Mon Sep 17 00:00:00 2001 From: Duc Le Tu Date: Fri, 8 Mar 2024 00:36:16 +0700 Subject: [PATCH 07/21] fix: only do row identity based joins when joining by index (#356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [x] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [x] Ensure the tests and linter pass - [x] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) Fixes #355 🦕 --- bigframes/core/__init__.py | 2 +- bigframes/core/compile/single_column.py | 4 +- bigframes/core/nodes.py | 2 +- tests/system/conftest.py | 7 ++ .../test_issue355_merge_after_filter.py | 70 +++++++++++++++++++ 5 files changed, 81 insertions(+), 4 deletions(-) create mode 100644 tests/system/small/regression/test_issue355_merge_after_filter.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 4dc2e4d7af..9032993452 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -349,7 +349,7 @@ def join( self, other: ArrayValue, join_def: join_def.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ): return ArrayValue( nodes.JoinNode( diff --git a/bigframes/core/compile/single_column.py b/bigframes/core/compile/single_column.py index d26e71d1b4..7beebfcb66 100644 --- a/bigframes/core/compile/single_column.py +++ b/bigframes/core/compile/single_column.py @@ -33,7 +33,7 @@ def join_by_column_ordered( left: compiled.OrderedIR, right: compiled.OrderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.OrderedIR: """Join two expressions by column equality. @@ -134,7 +134,7 @@ def join_by_column_unordered( left: compiled.UnorderedIR, right: compiled.UnorderedIR, join: join_defs.JoinDefinition, - allow_row_identity_join: bool = True, + allow_row_identity_join: bool = False, ) -> compiled.UnorderedIR: """Join two expressions by column equality. 
diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index f637177a94..1cd3277cbc 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -115,7 +115,7 @@ class JoinNode(BigFrameNode): left_child: BigFrameNode right_child: BigFrameNode join: JoinDefinition - allow_row_identity_join: bool = True + allow_row_identity_join: bool = False @property def row_preserving(self) -> bool: diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 7ca1882fe0..4b5ebc9d43 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -285,6 +285,13 @@ def scalars_table_id(test_data_tables) -> str: return test_data_tables["scalars"] +@pytest.fixture(scope="session") +def baseball_schedules_df(session: bigframes.Session) -> bigframes.dataframe.DataFrame: + """Public BQ table""" + df = session.read_gbq("bigquery-public-data.baseball.schedules") + return df + + @pytest.fixture(scope="session") def hockey_table_id(test_data_tables) -> str: return test_data_tables["hockey_players"] diff --git a/tests/system/small/regression/test_issue355_merge_after_filter.py b/tests/system/small/regression/test_issue355_merge_after_filter.py new file mode 100644 index 0000000000..24ee01cb7f --- /dev/null +++ b/tests/system/small/regression/test_issue355_merge_after_filter.py @@ -0,0 +1,70 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas as pd +import pytest + +from tests.system.utils import assert_pandas_df_equal + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_after_filter(baseball_schedules_df, merge_how): + on = ["awayTeamName"] + left_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + right_columns = [ + "gameId", + "year", + "homeTeamName", + "awayTeamName", + "duration_minutes", + ] + + left = baseball_schedules_df[left_columns] + left = left[left["homeTeamName"] == "Rays"] + # Offset the rows somewhat so that outer join can have an effect. + right = baseball_schedules_df[right_columns] + right = right[right["homeTeamName"] == "White Sox"] + + df = left.merge(right, on=on, how=merge_how) + bf_result = df.to_pandas() + + left_pandas = baseball_schedules_df.to_pandas()[left_columns] + left_pandas = left_pandas[left_pandas["homeTeamName"] == "Rays"] + + right_pandas = baseball_schedules_df.to_pandas()[right_columns] + right_pandas = right_pandas[right_pandas["homeTeamName"] == "White Sox"] + + pd_result = pd.merge( + left_pandas, + right_pandas, + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal(bf_result, pd_result, ignore_order=True) From 38bd2ba21bc1a3222635de22eecd97930bf5b1de Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Thu, 7 Mar 2024 10:38:40 -0800 Subject: [PATCH 08/21] docs: fix the note rendering for DataFrames methods: nlargest, nsmallest (#417) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes internal issue #328445384 🦕 --- .../bigframes_vendored/pandas/core/frame.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 84d2aa7fcb..f88649ca13 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -3935,6 +3935,11 @@ def nlargest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=False).head(n)``, but more performant. + .. note:: + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. + **Examples:** >>> import bigframes.pandas as bpd @@ -4002,11 +4007,6 @@ def nlargest(self, n: int, columns, keep: str = "first"): Returns: DataFrame: The first `n` rows ordered by the given columns in descending order. - - .. note:: - This function cannot be used with all column types. For example, when - specifying columns with `object` or `category` dtypes, ``TypeError`` is - raised. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -4022,6 +4022,12 @@ def nsmallest(self, n: int, columns, keep: str = "first"): ``df.sort_values(columns, ascending=True).head(n)``, but more performant. + .. note:: + + This function cannot be used with all column types. For example, when + specifying columns with `object` or `category` dtypes, ``TypeError`` is + raised. 
+ **Examples:** >>> import bigframes.pandas as bpd @@ -4090,11 +4096,6 @@ def nsmallest(self, n: int, columns, keep: str = "first"): Returns: DataFrame: The first `n` rows ordered by the given columns in ascending order. - - .. note:: - This function cannot be used with all column types. For example, when - specifying columns with `object` or `category` dtypes, ``TypeError`` is - raised. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 36173b0c14747fb52909bbedd93249024bae9ac1 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Thu, 7 Mar 2024 22:06:15 +0000 Subject: [PATCH 09/21] docs: Document minimum IAM requirement (#416) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [x] Appropriate docs were updated (if necessary) - https://screenshot.googleplex.com/BPPQ6YVWYykCSus - https://screenshot.googleplex.com/8v3JYwcJJemSKRd Fixes internal issue #328086566 🦕 --- README.rst | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index f6d6f93e36..ad96382df8 100644 --- a/README.rst +++ b/README.rst @@ -34,6 +34,11 @@ Prerequisites `install and initialize the gcloud CLI `_, and then generate the application default credentials by doing `gcloud auth application-default login `_. +* The user must have + `BigQuery Job User `_ and + `BigQuery Read Session User `_ + roles for the minimum usage. 
Additional IAM requirements apply for using remote + functions and ML. Code sample ^^^^^^^^^^^ @@ -215,6 +220,30 @@ steps and an estimator together. to create a pipeline of transforms with a final estimator. +ML remote models +---------------- + +**Requirements** + +To use BigQuery DataFrames ML remote models (`bigframes.ml.remote` or `bigframes.ml.llm`), +you must enable the following APIs: + +* The BigQuery API (bigquery.googleapis.com) +* The BigQuery Connection API (bigqueryconnection.googleapis.com) +* The Vertex AI API (aiplatform.googleapis.com) + +and you must be granted the following IAM roles: + +* BigQuery Data Editor (roles/bigquery.dataEditor) +* BigQuery Connection Admin (roles/bigquery.connectionAdmin) +* Service Account User (roles/iam.serviceAccountUser) on the + `service account `__ + ``PROJECT_NUMBER-compute@developer.gserviceaccount.com`` +* Vertex AI User (roles/aiplatform.user) +* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default + BigQuery connection, or Browser (roles/browser) if using a pre-created connection + + ML locations ------------ @@ -311,24 +340,8 @@ following IAM roles: `service account `__ ``PROJECT_NUMBER-compute@developer.gserviceaccount.com`` * Storage Object Viewer (roles/storage.objectViewer) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) - -To use BigQuery DataFrames ML remote models(bigframes.ml.remote or bigframes.ml.llm), you must enable the following APIs: - -* The BigQuery API (bigquery.googleapis.com) -* The BigQuery Connection API (bigqueryconnection.googleapis.com) -* The Vertex AI API (aiplatform.googleapis.com) - -To use BigQuery DataFrames ML remote models(bigframes.ml.remote or bigframes.ml.llm), you must be granted the -following IAM roles: - -* BigQuery Data Editor (roles/bigquery.dataEditor) -* BigQuery Connection Admin (roles/bigquery.connectionAdmin) -* Service Account User (roles/iam.serviceAccountUser) on the - `service account `__ - 
``PROJECT_NUMBER-compute@developer.gserviceaccount.com`` -* Vertex AI User (roles/aiplatform.user) -* Project IAM Admin (roles/resourcemanager.projectIamAdmin) +* Project IAM Admin (roles/resourcemanager.projectIamAdmin) if using default + BigQuery connection, or Browser (roles/browser) if using a pre-created connection **Limitations** From 11a37433f99603e645499d9c1762ed7e65d364fb Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 7 Mar 2024 18:48:17 -0800 Subject: [PATCH 10/21] refactor: export operations use session.execute path (#418) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/dataframe.py | 25 ++++++++++++++----------- bigframes/session/__init__.py | 10 ++++++++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d467239ea6..24c4699473 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2932,8 +2932,9 @@ def map_columns_on_occurrence(columns): return clustering_columns_for_index + clustering_columns_for_df - def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: - """Create query text representing this dataframe for I/O.""" + def _prepare_export( + self, index: bool, ordering_id: Optional[str] + ) -> Tuple[bigframes.core.ArrayValue, Dict[str, str]]: array_value = self._block.expr new_col_labels, new_idx_labels = utils.get_standardized_ids( @@ -2961,10 +2962,7 @@
def _create_io_query(self, index: bool, ordering_id: Optional[str]) -> str: if ordering_id is not None: array_value = array_value.promote_offsets(ordering_id) - return self._block.session._to_sql( - array_value=array_value, - col_id_overrides=id_overrides, - ) + return array_value, id_overrides def _run_io_query( self, @@ -2974,11 +2972,16 @@ def _run_io_query( ) -> bigquery.TableReference: """Executes a query job presenting this dataframe and returns the destination table.""" - expr = self._block.expr - session = expr.session - sql = self._create_io_query(index=index, ordering_id=ordering_id) - _, query_job = session._start_query( - sql=sql, job_config=job_config # type: ignore + session = self._block.expr.session + export_array, id_overrides = self._prepare_export( + index=index, ordering_id=ordering_id + ) + + _, query_job = session._execute( + export_array, + job_config=job_config, + sorted=False, + col_id_overrides=id_overrides, ) self._set_internal_query_job(query_job) diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 4b30a3a9d1..190ce17ee1 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -1626,9 +1626,15 @@ def _execute( *, sorted: bool = True, dry_run=False, + col_id_overrides: Mapping[str, str] = {}, ) -> tuple[bigquery.table.RowIterator, bigquery.QueryJob]: - sql = self._to_sql(array_value, sorted=sorted) # type:ignore - job_config = bigquery.QueryJobConfig(dry_run=dry_run) + sql = self._to_sql( + array_value, sorted=sorted, col_id_overrides=col_id_overrides + ) # type:ignore + if job_config is None: + job_config = bigquery.QueryJobConfig(dry_run=dry_run) + else: + job_config.dry_run = dry_run return self._start_query( sql=sql, job_config=job_config, From 6a3b0cc7f84120fc5978ce11b6b7c55e89654304 Mon Sep 17 00:00:00 2001 From: Stephanie A <129541811+DevStephanie@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:19:52 -0600 Subject: [PATCH 11/21] docs: add predict sample to 
samples/snippets/bqml_getting_started_test.py (#388) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: Add a sample to demonstrate the evaluation results * Adding comments explaining logistic regression results * editing read_gbq explanation * docs: add predict sample to samples/snippets/bqml_getting_started_test.py * correcting variable names * Correcting python variables * feat: add predict by visit to samples/snippets/bqml_getting_started_test.py * file * file * file --------- Co-authored-by: Tim Sweña (Swast, formerly) --- samples/snippets/bqml_getting_started_test.py | 213 ++++++++++++++---- 1 file changed, 171 insertions(+), 42 deletions(-) diff --git a/samples/snippets/bqml_getting_started_test.py b/samples/snippets/bqml_getting_started_test.py index bb282fa563..d9f9135faa 100644 --- a/samples/snippets/bqml_getting_started_test.py +++ b/samples/snippets/bqml_getting_started_test.py @@ -14,7 +14,7 @@ def test_bqml_getting_started(random_model_id): - your_model_id = random_model_id + your_model_id = random_model_id # for example: bqml_tutorial.sample_model # [START bigquery_dataframes_bqml_getting_started_tutorial] from bigframes.ml.linear_model import LogisticRegression @@ -26,17 +26,12 @@ def test_bqml_getting_started(random_model_id): # https://github.com/googleapis/python-bigquery-dataframes/issues/169 # for updates to `read_gbq` to support wildcard tables. - df = bpd.read_gbq( - """ - -- Since the order of rows isn't useful for the model training, - -- generate a random ID to use as the index for the DataFrame.
- SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20160801' AND '20170630' - """, - index_col="rowindex", + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20160801"), + ("_table_suffix", "<=", "20170630"), + ], ) # Extract the total number of transactions within @@ -53,14 +48,14 @@ def test_bqml_getting_started(random_model_id): # ecommerce transactions within the Google Analytics session. # If the number of transactions is NULL, the value in the label # column is set to 0. Otherwise, it is set to 1. - label = transactions.notnull().map({True: 1, False: 0}) + label = transactions.notnull().map({True: 1, False: 0}).rename("label") # Extract the operating system of the visitor's device. - operatingSystem = df["device"].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") # Extract whether the visitor's device is a mobile device. - isMobile = df["device"].struct.field("isMobile") + is_mobile = df["device"].struct.field("isMobile") # Extract the country from which the sessions originated, based on the IP address. country = df["geoNetwork"].struct.field("country").fillna("") @@ -72,8 +67,8 @@ def test_bqml_getting_started(random_model_id): # to use as training data. features = bpd.DataFrame( { - "os": operatingSystem, - "is_mobile": isMobile, + "os": operating_system, + "is_mobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -95,39 +90,36 @@ def test_bqml_getting_started(random_model_id): # [START bigquery_dataframes_bqml_getting_started_tutorial_evaluate] import bigframes.pandas as bpd - # Select model you'll use for training. `read_gbq_model` loads model data from a + # Select model you'll use for evaluating. 
`read_gbq_model` loads model data from a # BigQuery, but you could also use the `model` object from the previous steps. model = bpd.read_gbq_model( your_model_id, # For example: "bqml_tutorial.sample_model", ) - # The WHERE clause — _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' — - # limits the number of tables scanned by the query. The date range scanned is - # July 1, 2017 to August 1, 2017. This is the data you're using to evaluate the predictive performance - # of the model. It was collected in the month immediately following the time - # period spanned by the training data. - - df = bpd.read_gbq( - """ - SELECT GENERATE_UUID() AS rowindex, * - FROM - `bigquery-public-data.google_analytics_sample.ga_sessions_*` - WHERE - _TABLE_SUFFIX BETWEEN '20170701' AND '20170801' - """, - index_col="rowindex", + # The filters parameter limits the number of tables scanned by the query. + # The date range scanned is July 1, 2017 to August 1, 2017. This is the + # data you're using to evaluate the predictive performance of the model. + # It was collected in the month immediately following the time period + # spanned by the training data. 
+ df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], ) + transactions = df["totals"].struct.field("transactions") - label = transactions.notnull().map({True: 1, False: 0}) - operatingSystem = df["device"].struct.field("operatingSystem") - operatingSystem = operatingSystem.fillna("") - isMobile = df["device"].struct.field("isMobile") + label = transactions.notnull().map({True: 1, False: 0}).rename("label") + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") country = df["geoNetwork"].struct.field("country").fillna("") pageviews = df["totals"].struct.field("pageviews").fillna(0) features = bpd.DataFrame( { - "os": operatingSystem, - "is_mobile": isMobile, + "os": operating_system, + "is_mobile": is_mobile, "country": country, "pageviews": pageviews, } @@ -163,6 +155,143 @@ def test_bqml_getting_started(random_model_id): # [1 rows x 6 columns] # [END bigquery_dataframes_bqml_getting_started_tutorial_evaluate] - # [START bigquery_dataframes_bqml_getting_started_tutorial_predict] + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] + import bigframes.pandas as bpd + + # Select model you'll use for predicting. + # `read_gbq_model` loads model data from + # BigQuery, but you could also use the `model` + # object from the previous steps. + model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + + # The filters parameter limits the number of tables scanned by the query. + # The date range scanned is July 1, 2017 to August 1, 2017. This is the + # data you're using to make the prediction. + # It was collected in the month immediately following the time period + # spanned by the training data. 
+ df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], + ) + + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + features = bpd.DataFrame( + { + "os": operating_system, + "is_mobile": is_mobile, + "country": country, + "pageviews": pageviews, + } + ) + # Use Logistic Regression predict method to predict results + # using your model. + # Find more information here in + # [BigFrames](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.ml.linear_model.LogisticRegression#bigframes_ml_linear_model_LogisticRegression_predict) + + predictions = model.predict(features) + + # Call groupby method to group predicted_label by country. + # Call sum method to get the total_predicted_label by country. + total_predicted_purchases = predictions.groupby(["country"])[ + ["predicted_label"] + ].sum() + + # Call the sort_values method with the parameter + # ascending = False to get the highest values. + # Call head method to limit to the 10 highest values. + total_predicted_purchases.sort_values(ascending=False).head(10) + + # country + # United States 220 + # Taiwan 8 + # Canada 7 + # India 2 + # Japan 2 + # Turkey 2 + # Australia 1 + # Brazil 1 + # Germany 1 + # Guyana 1 + # Name: predicted_label, dtype: Int64 + + # [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_country] + + # [START bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor] + + import bigframes.pandas as bpd + + # Select model you'll use for predicting. + # `read_gbq_model` loads model data from + # BigQuery, but you could also use the `model` + # object from the previous steps. 
+ model = bpd.read_gbq_model( + your_model_id, # For example: "bqml_tutorial.sample_model", + ) + + # The filters parameter limits the number of tables scanned by the query. + # The date range scanned is July 1, 2017 to August 1, 2017. This is the + # data you're using to make the prediction. + # It was collected in the month immediately following the time period + # spanned by the training data. + df = bpd.read_gbq_table( + "bigquery-public-data.google_analytics_sample.ga_sessions_*", + filters=[ + ("_table_suffix", ">=", "20170701"), + ("_table_suffix", "<=", "20170801"), + ], + ) + + operating_system = df["device"].struct.field("operatingSystem") + operating_system = operating_system.fillna("") + is_mobile = df["device"].struct.field("isMobile") + country = df["geoNetwork"].struct.field("country").fillna("") + pageviews = df["totals"].struct.field("pageviews").fillna(0) + full_visitor_id = df["fullVisitorId"] + + features = bpd.DataFrame( + { + "os": operating_system, + "is_mobile": is_mobile, + "country": country, + "pageviews": pageviews, + "fullVisitorId": full_visitor_id, + } + ) + + predictions = model.predict(features) + + # Call groupby method to group predicted_label by visitor. + # Call sum method to get the total_predicted_label by visitor. + total_predicted_purchases = predictions.groupby(["fullVisitorId"])[ + ["predicted_label"] + ].sum() + + # Call the sort_values method with the parameter + # ascending = False to get the highest values. + # Call head method to limit to the 10 highest values. 
+ total_predicted_purchases.sort_values(ascending=False).head(10) + + # fullVisitorId + # 9417857471295131045 4 + # 0376394056092189113 2 + # 0456807427403774085 2 + # 057693500927581077 2 + # 112288330928895942 2 + # 1280993661204347450 2 + # 2105122376016897629 2 + # 2158257269735455737 2 + # 2969418676126258798 2 + # 489038402765684003 2 + # Name: predicted_label, dtype: Int64 + - # [END bigquery_dataframes_bqml_getting_started_tutorial_predict] +# [END bigquery_dataframes_bqml_getting_started_tutorial_predict_by_visitor] From 815f578533fb1340296aca2be2083897f899a926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=2C=20formerly=29?= Date: Fri, 8 Mar 2024 15:18:13 -0600 Subject: [PATCH 12/21] chore: increase timeout on load tests (#419) * chore: increase timeout on load tests * increase vm timeout too --- .kokoro/load/common.cfg | 1 + noxfile.py | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.kokoro/load/common.cfg b/.kokoro/load/common.cfg index 97e0651aa9..7f6fa7e0d9 100644 --- a/.kokoro/load/common.cfg +++ b/.kokoro/load/common.cfg @@ -8,3 +8,4 @@ action { } build_file: "python-bigquery-dataframes/.kokoro/build.sh" +timeout_mins: 360 diff --git a/noxfile.py b/noxfile.py index bcd39f961f..db503c43fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -290,6 +290,7 @@ def run_system( install_test_extra=True, print_duration=False, extra_pytest_options=(), + timeout_seconds=900, ): """Run the system test suite.""" constraints_path = str( @@ -311,7 +312,7 @@ def run_system( "--quiet", "-n=20", # Any individual test taking longer than 15 mins will be terminated. 
- "--timeout=900", + f"--timeout={timeout_seconds}", # Log 20 slowest tests "--durations=20", f"--junitxml={prefix_name}_{session.python}_sponge_log.xml", @@ -395,6 +396,7 @@ def load(session: nox.sessions.Session): prefix_name="load", test_folder=os.path.join("tests", "system", "load"), print_duration=True, + timeout_seconds=60 * 60, ) From 9a678e35201d935e1d93875429005033cfe7cff6 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Sat, 9 Mar 2024 02:14:50 +0000 Subject: [PATCH 13/21] feat: Support CMEK for BQ tables (#403) * feat: Support CMEK for BQ tables * add more tests * add unit tests * add more tests, fix broken tests * separate bqml client to send kms_key_name via OPTIONS instead of job config * fix unit tests * fix mypy * skip cmek test for empty cmek * move staticmethods to helper module * revert bqmlclient, pass cmek through call time job config * revert bqmlclient unit test * fix mypy failure * use better named key, disable use_query_cache in test * rename bqml create model internal method * fix renamed methods's reference in unit tests * remove stray bqmlclient variable --- bigframes/_config/bigquery_options.py | 25 ++ bigframes/ml/core.py | 10 +- bigframes/pandas/__init__.py | 1 + bigframes/session/__init__.py | 126 +++++++--- bigframes/session/_io/bigquery.py | 39 ++- bigframes/session/clients.py | 46 ++-- tests/system/small/test_encryption.py | 256 ++++++++++++++++++++ tests/unit/_config/test_bigquery_options.py | 2 + tests/unit/ml/test_golden_sql.py | 11 +- tests/unit/session/test_clients.py | 1 + 10 files changed, 450 insertions(+), 67 deletions(-) create mode 100644 tests/system/small/test_encryption.py diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index 74b83429d0..34701740f6 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -39,6 +39,7 @@ def __init__( bq_connection: Optional[str] = None, use_regional_endpoints: bool = False, application_name: 
Optional[str] = None, + kms_key_name: Optional[str] = None, ): self._credentials = credentials self._project = project @@ -46,6 +47,7 @@ def __init__( self._bq_connection = bq_connection self._use_regional_endpoints = use_regional_endpoints self._application_name = application_name + self._kms_key_name = kms_key_name self._session_started = False @property @@ -148,3 +150,26 @@ def use_regional_endpoints(self, value: bool): ) self._use_regional_endpoints = value + + @property + def kms_key_name(self) -> Optional[str]: + """Customer managed encryption key used to control encryption of the + data-at-rest in BigQuery. This is of the format + projects/PROJECT_ID/locations/LOCATION/keyRings/KEYRING/cryptoKeys/KEY + + See https://cloud.google.com/bigquery/docs/customer-managed-encryption + for more details. + + Please make sure the project used for Bigquery DataFrames has "Cloud KMS + CryptoKey Encrypter/Decrypter" role in the key's project, See + https://cloud.google.com/bigquery/docs/customer-managed-encryption#assign_role + for steps on how to ensure that. 
+ """ + return self._kms_key_name + + @kms_key_name.setter + def kms_key_name(self, value: str): + if self._session_started and self._kms_key_name != value: + raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="kms_key_name")) + + self._kms_key_name = value diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index c496133aa7..24997708fb 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -212,7 +212,8 @@ def principal_component_info(self) -> bpd.DataFrame: return self._session.read_gbq(sql) def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: - job_config = bigquery.job.CopyJobConfig() + job_config = self._session._prepare_copy_job_config() + if replace: job_config.write_disposition = "WRITE_TRUNCATE" @@ -236,7 +237,7 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: options={"vertex_ai_model_id": vertex_ai_model_id} ) # Register the model and wait it to finish - self._session._start_query(sql) + self._session._start_query_create_model(sql) self._model = self._session.bqclient.get_model(self.model_name) return self @@ -255,7 +256,7 @@ def _create_model_ref( def _create_model_with_sql(self, session: bigframes.Session, sql: str) -> BqmlModel: # fit the model, synchronously - _, job = session._start_query(sql) + _, job = session._start_query_create_model(sql) # real model path in the session specific hidden dataset and table prefix model_name_full = f"{job.destination.project}.{job.destination.dataset_id}.{job.destination.table_id}" @@ -298,6 +299,9 @@ def create_model( options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) session = X_train._session + if session._bq_kms_key_name: + options.update({"kms_key_name": session._bq_kms_key_name}) + model_ref = self._create_model_ref(session._anonymous_dataset) sql = self._model_creation_sql_generator.create_model( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 3120e96b1a..195d7eabfa 100644 --- 
a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -383,6 +383,7 @@ def _set_default_session_location_if_possible(query): use_regional_endpoints=options.bigquery.use_regional_endpoints, credentials=options.bigquery.credentials, application_name=options.bigquery.application_name, + bq_kms_key_name=options.bigquery.kms_key_name, ) bqclient = clients_provider.bqclient diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index 190ce17ee1..b553865ea9 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -65,7 +65,6 @@ import bigframes._config.bigquery_options as bigquery_options import bigframes.constants as constants -from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.core.compile @@ -84,7 +83,6 @@ # Even though the ibis.backends.bigquery import is unused, it's needed # to register new and replacement ops with the Ibis BigQuery backend. -import third_party.bigframes_vendored.ibis.backends.bigquery # noqa import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet @@ -161,6 +159,8 @@ def __init__( else: self._location = context.location + self._bq_kms_key_name = context.kms_key_name + # Instantiate a clients provider to help with cloud clients that will be # used in the future operations in the session if clients_provider: @@ -172,9 +172,17 @@ def __init__( use_regional_endpoints=context.use_regional_endpoints, credentials=context.credentials, application_name=context.application_name, + bq_kms_key_name=self._bq_kms_key_name, ) self._create_bq_datasets() + + # TODO(shobs): Remove this logic after https://github.com/ibis-project/ibis/issues/8494 + # has been fixed. 
The ibis client changes the default query job config + # so we are going to remember the current config and restore it after + # the ibis client has been created + original_default_query_job_config = self.bqclient.default_query_job_config + self.ibis_client = typing.cast( ibis_bigquery.Backend, ibis.bigquery.connect( @@ -184,6 +192,9 @@ def __init__( ), ) + self.bqclient.default_query_job_config = original_default_query_job_config + + # Resolve the BQ connection for remote function and Vertex AI integration self._bq_connection = context.bq_connection or _BIGFRAMES_DEFAULT_CONNECTION_ID # Now that we're starting the session, don't allow the options to be @@ -929,6 +940,8 @@ def _read_pandas_load_job( pandas_dataframe_copy.columns = pandas.Index(new_col_ids) pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) + job_config = self._prepare_load_job_config() + # Specify the datetime dtypes, which is auto-detected as timestamp types. schema: list[bigquery.SchemaField] = [] for column, dtype in zip(pandas_dataframe.columns, pandas_dataframe.dtypes): @@ -936,12 +949,12 @@ def _read_pandas_load_job( schema.append( bigquery.SchemaField(column, bigquery.enums.SqlTypeNames.DATETIME) ) + job_config.schema = schema # Clustering probably not needed anyways as pandas tables are small cluster_cols = [ordering_col] - - job_config = bigquery.LoadJobConfig(schema=schema) job_config.clustering_fields = cluster_cols + job_config.labels = {"bigframes-api": api_name} load_table_destination = bigframes_io.random_table(self._anonymous_dataset) @@ -1061,7 +1074,7 @@ def read_csv( f"{constants.FEEDBACK_LINK}" ) - job_config = bigquery.LoadJobConfig() + job_config = self._prepare_load_job_config() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.CSV job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1136,7 +1149,7 @@ def read_parquet( table = 
bigframes_io.random_table(self._anonymous_dataset) if engine == "bigquery": - job_config = bigquery.LoadJobConfig() + job_config = self._prepare_load_job_config() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.PARQUET job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1194,7 +1207,7 @@ def read_json( "'lines' keyword is only valid when 'orient' is 'records'." ) - job_config = bigquery.LoadJobConfig() + job_config = self._prepare_load_job_config() job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY @@ -1518,6 +1531,53 @@ def read_gbq_function( session=self, ) + def _prepare_query_job_config( + self, + job_config: Optional[bigquery.QueryJobConfig] = None, + ) -> bigquery.QueryJobConfig: + if job_config is None: + job_config = bigquery.QueryJobConfig() + else: + # Create a copy so that we don't mutate the original config passed + job_config = typing.cast( + bigquery.QueryJobConfig, + bigquery.QueryJobConfig.from_api_repr(job_config.to_api_repr()), + ) + + if bigframes.options.compute.maximum_bytes_billed is not None: + job_config.maximum_bytes_billed = ( + bigframes.options.compute.maximum_bytes_billed + ) + + if self._bq_kms_key_name: + job_config.destination_encryption_configuration = ( + bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) + ) + + return job_config + + def _prepare_load_job_config(self) -> bigquery.LoadJobConfig: + # Create a fresh config; there is no caller-provided config to copy + job_config = bigquery.LoadJobConfig() + + if self._bq_kms_key_name: + job_config.destination_encryption_configuration = ( + bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) + ) + + return job_config + + def _prepare_copy_job_config(self) -> bigquery.CopyJobConfig: + # Create 
a fresh config; there is no caller-provided config to copy + job_config = bigquery.CopyJobConfig() + + if self._bq_kms_key_name: + job_config.destination_encryption_configuration = ( + bigquery.EncryptionConfiguration(kms_key_name=self._bq_kms_key_name) + ) + + return job_config + def _start_query( self, sql: str, @@ -1525,29 +1585,30 @@ def _start_query( max_results: Optional[int] = None, ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: """ - Starts query job and waits for results. + Starts BigQuery query job and waits for results. """ - job_config = self._prepare_job_config(job_config) - api_methods = log_adapter.get_and_reset_api_methods() - job_config.labels = bigframes_io.create_job_configs_labels( - job_configs_labels=job_config.labels, api_methods=api_methods + job_config = self._prepare_query_job_config(job_config) + return bigframes.session._io.bigquery.start_query_with_client( + self.bqclient, sql, job_config, max_results ) - try: - query_job = self.bqclient.query(sql, job_config=job_config) - except google.api_core.exceptions.Forbidden as ex: - if "Drive credentials" in ex.message: - ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." - raise + def _start_query_create_model( + self, + sql: str, + ) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """ + Starts BigQuery ML CREATE MODEL query job and waits for results. 
+ """ + job_config = self._prepare_query_job_config() - opts = bigframes.options.display - if opts.progress_bar is not None and not query_job.configuration.dry_run: - results_iterator = formatting_helpers.wait_for_query_job( - query_job, max_results, opts.progress_bar - ) - else: - results_iterator = query_job.result(max_results=max_results) - return results_iterator, query_job + # BQML expects kms_key_name through OPTIONS and not through job config, + # so we must reset any encryption set in the job config + # https://cloud.google.com/bigquery/docs/customer-managed-encryption#encrypt-model + job_config.destination_encryption_configuration = None + + return bigframes.session._io.bigquery.start_query_with_client( + self.bqclient, sql, job_config + ) def _cache_with_cluster_cols( self, array_value: core.ArrayValue, cluster_cols: typing.Sequence[str] @@ -1696,19 +1757,6 @@ def _start_generic_job(self, job: formatting_helpers.GenericJob): else: job.result() - def _prepare_job_config( - self, job_config: Optional[bigquery.QueryJobConfig] = None - ) -> bigquery.QueryJobConfig: - if job_config is None: - job_config = self.bqclient.default_query_job_config - if job_config is None: - job_config = bigquery.QueryJobConfig() - if bigframes.options.compute.maximum_bytes_billed is not None: - job_config.maximum_bytes_billed = ( - bigframes.options.compute.maximum_bytes_billed - ) - return job_config - def connect(context: Optional[bigquery_options.BigQueryOptions] = None) -> Session: return Session(context) diff --git a/bigframes/session/_io/bigquery.py b/bigframes/session/_io/bigquery.py index 3695fc98e8..67820bbbcb 100644 --- a/bigframes/session/_io/bigquery.py +++ b/bigframes/session/_io/bigquery.py @@ -20,11 +20,17 @@ import itertools import textwrap import types -from typing import Dict, Iterable, Optional, Sequence, Union +from typing import Dict, Iterable, Optional, Sequence, Tuple, Union import uuid +import google.api_core.exceptions import google.cloud.bigquery as 
bigquery +import bigframes +from bigframes.core import log_adapter +import bigframes.formatting_helpers as formatting_helpers +import bigframes.session._io.bigquery as bigframes_io + IO_ORDERING_ID = "bqdf_row_nums" MAX_LABELS_COUNT = 64 TEMP_TABLE_PREFIX = "bqdf{date}_{random_id}" @@ -207,3 +213,34 @@ def format_option(key: str, value: Union[bool, str]) -> str: if isinstance(value, bool): return f"{key}=true" if value else f"{key}=false" return f"{key}={repr(value)}" + + +def start_query_with_client( + bq_client: bigquery.Client, + sql: str, + job_config: bigquery.job.QueryJobConfig, + max_results: Optional[int] = None, +) -> Tuple[bigquery.table.RowIterator, bigquery.QueryJob]: + """ + Starts query job and waits for results. + """ + api_methods = log_adapter.get_and_reset_api_methods() + job_config.labels = bigframes_io.create_job_configs_labels( + job_configs_labels=job_config.labels, api_methods=api_methods + ) + + try: + query_job = bq_client.query(sql, job_config=job_config) + except google.api_core.exceptions.Forbidden as ex: + if "Drive credentials" in ex.message: + ex.message += "\nCheck https://cloud.google.com/bigquery/docs/query-drive-data#Google_Drive_permissions." 
+ raise + + opts = bigframes.options.display + if opts.progress_bar is not None and not query_job.configuration.dry_run: + results_iterator = formatting_helpers.wait_for_query_job( + query_job, max_results, opts.progress_bar + ) + else: + results_iterator = query_job.result(max_results=max_results) + return results_iterator, query_job diff --git a/bigframes/session/clients.py b/bigframes/session/clients.py index 627c9258a6..7574aa4454 100644 --- a/bigframes/session/clients.py +++ b/bigframes/session/clients.py @@ -68,6 +68,7 @@ def __init__( use_regional_endpoints: Optional[bool], credentials: Optional[google.auth.credentials.Credentials], application_name: Optional[str], + bq_kms_key_name: Optional[str], ): credentials_project = None if credentials is None: @@ -98,6 +99,7 @@ def __init__( self._location = location self._use_regional_endpoints = use_regional_endpoints self._credentials = credentials + self._bq_kms_key_name = bq_kms_key_name # cloud clients initialized for lazy load self._bqclient = None @@ -106,28 +108,34 @@ def __init__( self._cloudfunctionsclient = None self._resourcemanagerclient = None + def _create_bigquery_client(self): + bq_options = None + if self._use_regional_endpoints: + bq_options = google.api_core.client_options.ClientOptions( + api_endpoint=( + _BIGQUERY_REGIONAL_ENDPOINT + if self._location.lower() in _REP_SUPPORTED_REGIONS + else _BIGQUERY_LOCATIONAL_ENDPOINT + ).format(location=self._location), + ) + bq_info = google.api_core.client_info.ClientInfo( + user_agent=self._application_name + ) + + bq_client = bigquery.Client( + client_info=bq_info, + client_options=bq_options, + credentials=self._credentials, + project=self._project, + location=self._location, + ) + + return bq_client + @property def bqclient(self): if not self._bqclient: - bq_options = None - if self._use_regional_endpoints: - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=( - _BIGQUERY_REGIONAL_ENDPOINT - if self._location.lower() in 
_REP_SUPPORTED_REGIONS - else _BIGQUERY_LOCATIONAL_ENDPOINT - ).format(location=self._location), - ) - bq_info = google.api_core.client_info.ClientInfo( - user_agent=self._application_name - ) - self._bqclient = bigquery.Client( - client_info=bq_info, - client_options=bq_options, - credentials=self._credentials, - project=self._project, - location=self._location, - ) + self._bqclient = self._create_bigquery_client() return self._bqclient diff --git a/tests/system/small/test_encryption.py b/tests/system/small/test_encryption.py new file mode 100644 index 0000000000..0ce9d881fd --- /dev/null +++ b/tests/system/small/test_encryption.py @@ -0,0 +1,256 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from google.cloud import bigquery +import pandas +import pytest + +import bigframes +import bigframes.ml.linear_model + + +@pytest.fixture(scope="module") +def bq_cmek() -> str: + """Customer managed encryption key to encrypt BigQuery data at rest. + + This is of the form projects/PROJECT_ID/locations/LOCATION/keyRings/KEY_RING/cryptoKeys/KEY + + See https://cloud.google.com/bigquery/docs/customer-managed-encryption for steps. + """ + + # NOTE: This key is manually set up through the cloud console + # TODO(shobs): Automate the key creation during the test. This will + # require extra IAM privileges for the test runner. 
+ return "projects/bigframes-dev-perf/locations/us/keyRings/bigframesKeyRing/cryptoKeys/bigframesKey" + + +@pytest.fixture(scope="module") +def session_with_bq_cmek(bq_cmek) -> bigframes.Session: + session = bigframes.Session(bigframes.BigQueryOptions(kms_key_name=bq_cmek)) + + return session + + +def _assert_bq_table_is_encrypted( + df: bigframes.dataframe.DataFrame, + cmek: str, + session: bigframes.Session, +): + # Materialize the data in BQ + repr(df) + + # The df should be backed by a query job with intended encryption on the result table + assert df.query_job is not None + assert df.query_job.destination_encryption_configuration.kms_key_name.startswith( + cmek + ) + + # The result table should exist with the intended encryption + table = session.bqclient.get_table(df.query_job.destination) + assert table.encryption_configuration.kms_key_name == cmek + + +def test_session_query_job(bq_cmek, session_with_bq_cmek): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + _, query_job = session_with_bq_cmek._start_query( + "SELECT 123", job_config=bigquery.QueryJobConfig(use_query_cache=False) + ) + query_job.result() + + assert query_job.destination_encryption_configuration.kms_key_name.startswith( + bq_cmek + ) + + # The result table should exist with the intended encryption + table = session_with_bq_cmek.bqclient.get_table(query_job.destination) + assert table.encryption_configuration.kms_key_name == bq_cmek + + +def test_session_load_job(bq_cmek, session_with_bq_cmek): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Session should have cmek set in the default query and load job configs + load_table = bigframes.session._io.bigquery.random_table( + session_with_bq_cmek._anonymous_dataset + ) + + df = pandas.DataFrame({"col0": [1, 2, 3]}) + load_job_config = session_with_bq_cmek._prepare_load_job_config() + load_job_config.schema = [ + bigquery.SchemaField(df.columns[0], bigquery.enums.SqlTypeNames.INT64) + ] + + load_job = 
session_with_bq_cmek.bqclient.load_table_from_dataframe( + df, + load_table, + job_config=load_job_config, + ) + load_job.result() + + assert load_job.destination == load_table + assert load_job.destination_encryption_configuration.kms_key_name.startswith( + bq_cmek + ) + + # The load destination table should be created with the intended encryption + table = session_with_bq_cmek.bqclient.get_table(load_job.destination) + assert table.encryption_configuration.kms_key_name == bq_cmek + + +def test_read_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Read the BQ table + df = session_with_bq_cmek.read_gbq(scalars_table_id) + + # Assert encryption + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + +def test_df_apis(bq_cmek, session_with_bq_cmek, scalars_table_id): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Read a BQ table and assert encryption + df = session_with_bq_cmek.read_gbq(scalars_table_id) + + # Perform a few dataframe operations and assert encryption + df1 = df.dropna() + _assert_bq_table_is_encrypted(df1, bq_cmek, session_with_bq_cmek) + + df2 = df1.head() + _assert_bq_table_is_encrypted(df2, bq_cmek, session_with_bq_cmek) + + +@pytest.mark.parametrize( + "engine", + [ + pytest.param("bigquery", id="bq_engine"), + pytest.param( + None, + id="default_engine", + marks=pytest.mark.skip( + reason="Internal issue 327544164, cmek does not propagate to the dataframe." 
+ ), + ), + ], +) +def test_read_csv_gcs( + bq_cmek, session_with_bq_cmek, scalars_df_index, gcs_folder, engine +): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Create a csv in gcs + write_path = gcs_folder + "test_read_csv_gcs_bigquery_engine*.csv" + read_path = ( + write_path.replace("*", "000000000000") if engine is None else write_path + ) + scalars_df_index.to_csv(write_path) + + # Read the csv from gcs + df = session_with_bq_cmek.read_csv(read_path, engine=engine) + + # Assert encryption + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + +def test_to_gbq(bq_cmek, session_with_bq_cmek, scalars_table_id): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Read a BQ table and assert encryption + df = session_with_bq_cmek.read_gbq(scalars_table_id) + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + # Modify the dataframe and assert encryption + df = df.dropna().head() + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + # Write the result to BQ and assert encryption + output_table_id = df.to_gbq() + output_table = session_with_bq_cmek.bqclient.get_table(output_table_id) + assert output_table.encryption_configuration.kms_key_name == bq_cmek + + +@pytest.mark.skip( + reason="Internal issue 327544164, cmek does not propagate to the dataframe." 
+) +def test_read_pandas(bq_cmek, session_with_bq_cmek): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Read a pandas dataframe + df = session_with_bq_cmek.read_pandas(pandas.DataFrame([1])) + + # Assert encryption + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + +def test_read_pandas_large(bq_cmek, session_with_bq_cmek): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + # Read a pandas dataframe large enough to trigger a BQ load job + df = session_with_bq_cmek.read_pandas(pandas.DataFrame(range(10_000))) + + # Assert encryption + _assert_bq_table_is_encrypted(df, bq_cmek, session_with_bq_cmek) + + +def test_bqml(bq_cmek, session_with_bq_cmek, penguins_table_id): + if not bq_cmek: + pytest.skip("no cmek set for testing") + + model = bigframes.ml.linear_model.LinearRegression() + df = session_with_bq_cmek.read_gbq(penguins_table_id).dropna() + X_train = df[ + [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "sex", + ] + ] + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) + + assert model is not None + assert model._bqml_model.model.encryption_configuration is not None + assert model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek + + # Assert that model exists in BQ with intended encryption + model_bq = session_with_bq_cmek.bqclient.get_model(model._bqml_model.model_name) + assert model_bq.encryption_configuration.kms_key_name == bq_cmek + + # Explicitly save the model to a destination and assert that encryption holds + model_ref = model._bqml_model_factory._create_model_ref( + session_with_bq_cmek._anonymous_dataset + ) + model_ref_full_name = ( + f"{model_ref.project}.{model_ref.dataset_id}.{model_ref.model_id}" + ) + new_model = model.to_gbq(model_ref_full_name) + assert new_model._bqml_model.model.encryption_configuration.kms_key_name == bq_cmek + + # Assert that model exists in BQ with intended encryption + model_bq = 
session_with_bq_cmek.bqclient.get_model(new_model._bqml_model.model_name) + assert model_bq.encryption_configuration.kms_key_name == bq_cmek diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index e5b6cfe2f1..1ce70e3da2 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -29,6 +29,7 @@ ("project", "my-project", "my-other-project"), ("bq_connection", "path/to/connection/1", "path/to/connection/2"), ("use_regional_endpoints", False, True), + ("kms_key_name", "kms/key/name/1", "kms/key/name/2"), ], ) def test_setter_raises_if_session_started(attribute, original_value, new_value): @@ -61,6 +62,7 @@ def test_setter_raises_if_session_started(attribute, original_value, new_value): "project", "bq_connection", "use_regional_endpoints", + "bq_kms_key_name", ] ], ) diff --git a/tests/unit/ml/test_golden_sql.py b/tests/unit/ml/test_golden_sql.py index 017c96d46d..25e12d87bf 100644 --- a/tests/unit/ml/test_golden_sql.py +++ b/tests/unit/ml/test_golden_sql.py @@ -35,6 +35,7 @@ def mock_session(): mock_session._anonymous_dataset = bigquery.DatasetReference( TEMP_MODEL_ID.project, TEMP_MODEL_ID.dataset_id ) + mock_session._bq_kms_key_name = None query_job = mock.create_autospec(bigquery.QueryJob) type(query_job).destination = mock.PropertyMock( @@ -42,7 +43,7 @@ def mock_session(): mock_session._anonymous_dataset, TEMP_MODEL_ID.model_id ) ) - mock_session._start_query.return_value = (None, query_job) + mock_session._start_query_create_model.return_value = (None, query_job) return mock_session @@ -103,7 +104,7 @@ def test_linear_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query.assert_called_once_with( + mock_session._start_query_create_model.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n 
data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=True,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -113,7 +114,7 @@ def test_linear_regression_params_fit(bqml_model_factory, mock_session, mock_X, model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query.assert_called_once_with( + mock_session._start_query_create_model.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LINEAR_REG",\n data_split_method="NO_SPLIT",\n optimize_strategy="normal_equation",\n fit_intercept=False,\n l2_reg=0.0,\n max_iterations=20,\n learn_rate_strategy="line_search",\n early_stop=True,\n min_rel_progress=0.01,\n ls_init_learn_rate=0.1,\n calculate_p_values=False,\n enable_global_explain=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -146,7 +147,7 @@ def test_logistic_regression_default_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query.assert_called_once_with( + mock_session._start_query_create_model.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n data_split_method="NO_SPLIT",\n fit_intercept=True,\n auto_class_weights=False,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) @@ -160,7 +161,7 @@ def test_logistic_regression_params_fit( model._bqml_model_factory = bqml_model_factory model.fit(mock_X, mock_y) - mock_session._start_query.assert_called_once_with( + mock_session._start_query_create_model.assert_called_once_with( 'CREATE OR REPLACE MODEL `test-project`.`_anon123`.`temp_model_id`\nOPTIONS(\n model_type="LOGISTIC_REG",\n 
data_split_method="NO_SPLIT",\n fit_intercept=False,\n auto_class_weights=True,\n INPUT_LABEL_COLS=["input_column_label"])\nAS input_X_y_sql' ) diff --git a/tests/unit/session/test_clients.py b/tests/unit/session/test_clients.py index f1b2a5045a..30ba2f9091 100644 --- a/tests/unit/session/test_clients.py +++ b/tests/unit/session/test_clients.py @@ -38,6 +38,7 @@ def create_clients_provider(application_name: Optional[str] = None): use_regional_endpoints=False, credentials=credentials, application_name=application_name, + bq_kms_key_name="projects/my-project/locations/us/keyRings/myKeyRing/cryptoKeys/myKey", ) From 60594f4011ff72617932f37c1e53d4d3ba683ce4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Mon, 11 Mar 2024 09:52:16 -0500 Subject: [PATCH 14/21] chore: don't require branch to be synced with main to merge (#425) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many recent changes (e.g. docs) don't need to be synced with `main` to be pretty confident they are safe to merge. Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- .github/sync-repo-settings.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml index cfa62f787c..80c73d991c 100644 --- a/.github/sync-repo-settings.yaml +++ b/.github/sync-repo-settings.yaml @@ -5,7 +5,7 @@ branchProtectionRules: # Defaults to `main` - pattern: main requiresCodeOwnerReviews: true - requiresStrictStatusChecks: true + requiresStrictStatusChecks: false requiredStatusCheckContexts: - 'conventionalcommits.org' - 'cla/google' From 4aadff4db59243b4510a874fef2bdb17402d1674 Mon Sep 17 00:00:00 2001 From: Chelsea Lin <124939984+chelsea-lin@users.noreply.github.com> Date: Mon, 11 Mar 2024 16:44:20 -0700 Subject: [PATCH 15/21] feat: (Series|Dataframe).plot.hist() (#420) * feat: (Series|Dataframe).plot.hist() --- bigframes/dataframe.py | 5 + bigframes/operations/_matplotlib/__init__.py | 30 +++ bigframes/operations/_matplotlib/core.py | 30 +++ bigframes/operations/_matplotlib/hist.py | 172 ++++++++++++++++++ bigframes/operations/plotting.py | 34 ++++ bigframes/series.py | 5 + docs/reference/bigframes.pandas/frame.rst | 11 ++ docs/reference/bigframes.pandas/series.rst | 9 + setup.py | 1 + testing/constraints-3.9.txt | 1 + tests/system/small/operations/test_plot.py | 168 +++++++++++++++++ .../bigframes_vendored/pandas/core/frame.py | 11 ++ .../bigframes_vendored/pandas/core/series.py | 11 ++ .../pandas/plotting/_core.py | 48 +++++ 14 files changed, 536 insertions(+) create mode 100644 bigframes/operations/_matplotlib/__init__.py create mode 100644 bigframes/operations/_matplotlib/core.py create mode 100644 bigframes/operations/_matplotlib/hist.py create mode 100644 bigframes/operations/plotting.py create mode 100644 
tests/system/small/operations/test_plot.py create mode 100644 third_party/bigframes_vendored/pandas/plotting/_core.py diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 24c4699473..a122212d04 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -59,6 +59,7 @@ import bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops +import bigframes.operations.plotting as plotting import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery @@ -3193,4 +3194,8 @@ def get_right_id(id): return result + @property + def plot(self): + return plotting.PlotAccessor(self) + __matmul__ = dot diff --git a/bigframes/operations/_matplotlib/__init__.py b/bigframes/operations/_matplotlib/__init__.py new file mode 100644 index 0000000000..f8770a9ef8 --- /dev/null +++ b/bigframes/operations/_matplotlib/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import bigframes.operations._matplotlib.core as core +import bigframes.operations._matplotlib.hist as hist + +PLOT_CLASSES: dict[str, type[core.MPLPlot]] = { + "hist": hist.HistPlot, +} + + +def plot(data, kind, **kwargs): + plot_obj = PLOT_CLASSES[kind](data, **kwargs) + plot_obj.generate() + plot_obj.draw() + return plot_obj.result + + +__all__ = ["plot"] diff --git a/bigframes/operations/_matplotlib/core.py b/bigframes/operations/_matplotlib/core.py new file mode 100644 index 0000000000..4b15d6f4dd --- /dev/null +++ b/bigframes/operations/_matplotlib/core.py @@ -0,0 +1,30 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import abc + +import matplotlib.pyplot as plt + + +class MPLPlot(abc.ABC): + @abc.abstractmethod + def generate(self): + pass + + def draw(self) -> None: + plt.draw_if_interactive() + + @property + def result(self): + return self.axes diff --git a/bigframes/operations/_matplotlib/hist.py b/bigframes/operations/_matplotlib/hist.py new file mode 100644 index 0000000000..720b94d7da --- /dev/null +++ b/bigframes/operations/_matplotlib/hist.py @@ -0,0 +1,172 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import itertools +from typing import Literal + +import numpy as np +import pandas as pd + +import bigframes.constants as constants +import bigframes.operations._matplotlib.core as bfplt + + +class HistPlot(bfplt.MPLPlot): + @property + def _kind(self) -> Literal["hist"]: + return "hist" + + def __init__( + self, + data, + bins: int = 10, + **kwargs, + ) -> None: + self.bins = bins + self.label = kwargs.get("label", None) + self.by = kwargs.pop("by", None) + self.kwargs = kwargs + + if self.by is not None: + raise NotImplementedError( + f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}" + ) + if not isinstance(self.bins, int): + raise NotImplementedError( + f"Only integer values are supported for the `bins` argument. {constants.FEEDBACK_LINK}" + ) + if kwargs.get("weight", None) is not None: + raise NotImplementedError( + f"Non-none `weight` argument is not yet supported. {constants.FEEDBACK_LINK}" + ) + + self.data = self._compute_plot_data(data) + + def generate(self) -> None: + """ + Calculates weighted histograms through BigQuery and plots them through pandas + native histogram plot. 
+ """ + hist_bars = self._calculate_hist_bars(self.data, self.bins) + bin_edges = self._calculate_bin_edges( + hist_bars, self.bins, self.kwargs.get("range", None) + ) + + weights = { + col_name: hist_bar.values for col_name, hist_bar in hist_bars.items() + } + hist_x = { + col_name: pd.Series( + ( + hist_bar.index.get_level_values("left_exclusive") + + hist_bar.index.get_level_values("right_inclusive") + ) + / 2.0 + ) + for col_name, hist_bar in hist_bars.items() + } + + # Align DataFrames for plotting despite potential differences in column + # lengths, filling shorter columns with zeros. + hist_x_pd = pd.DataFrame( + list(itertools.zip_longest(*hist_x.values())), columns=list(hist_x.keys()) + ).sort_index(axis=1)[self.data.columns.values] + weights_pd = pd.DataFrame( + list(itertools.zip_longest(*weights.values())), columns=list(weights.keys()) + ).sort_index(axis=1)[self.data.columns.values] + + # Prevents pandas from dropping NA values and causing length mismatches by + # filling them with zeros. + hist_x_pd.fillna(0, inplace=True) + weights_pd.fillna(0, inplace=True) + + self.axes = hist_x_pd.plot.hist( + bins=bin_edges, + weights=np.array(weights_pd.values), + **self.kwargs, + ) # type: ignore + + def _compute_plot_data(self, data): + """ + Prepares data for plotting, focusing on numeric data types. + + Raises: + TypeError: If the input data contains no numeric columns. + """ + # Importing at the top of the file causes a circular import. + import bigframes.series as series + + if isinstance(data, series.Series): + label = self.label + if label is None and data.name is None: + label = "" + if label is None: + data = data.to_frame() + else: + data = data.to_frame(name=label) + + # TODO(chelsealin): Support timestamp/date types here. 
+ include_type = ["number"] + numeric_data = data.select_dtypes(include=include_type) + try: + is_empty = numeric_data.columns.empty + except AttributeError: + is_empty = not len(numeric_data) + + if is_empty: + raise TypeError("no numeric data to plot") + + return numeric_data + + @staticmethod + def _calculate_hist_bars(data, bins): + """ + Calculates histogram bars for each column in a BigFrames DataFrame, and + returns a dictionary where keys are column names and values are pandas + Series. The series values are the histogram bins' heights with a + multi-index defining 'left_exclusive' and 'right_inclusive' bin edges. + """ + import bigframes.pandas as bpd + + # TODO: Optimize this by batching multiple jobs into one. + hist_bar = {} + for _, col in enumerate(data.columns): + cutted_data = bpd.cut(data[col], bins=bins, labels=None) + hist_bar[col] = ( + cutted_data.struct.explode() + .value_counts() + .to_pandas() + .sort_index(level="left_exclusive") + ) + return hist_bar + + @staticmethod + def _calculate_bin_edges(hist_bars, bins, range): + """ + Calculate bin edges from the histogram bars. + """ + bin_edges = None + for _, hist_bar in hist_bars.items(): + left = hist_bar.index.get_level_values("left_exclusive") + right = hist_bar.index.get_level_values("right_inclusive") + if bin_edges is None: + bin_edges = left.union(right) + else: + bin_edges = left.union(right).union(bin_edges) + + if bin_edges is None: + return None + + _, bins = np.histogram(bin_edges, bins=bins, range=range) + return bins diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py new file mode 100644 index 0000000000..ef36e9383a --- /dev/null +++ b/bigframes/operations/plotting.py @@ -0,0 +1,34 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Sequence + +import bigframes.constants as constants +import bigframes.operations._matplotlib as bfplt +import third_party.bigframes_vendored.pandas.plotting._core as vendordt + + +class PlotAccessor: + __doc__ = vendordt.PlotAccessor.__doc__ + + def __init__(self, data) -> None: + self._parent = data + + def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + if kwargs.pop("backend", None) is not None: + raise NotImplementedError( + f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}" + ) + # Calls matplotlib backend to plot the data. 
+ return bfplt.plot(self._parent.copy(), kind="hist", by=by, bins=bins, **kwargs) diff --git a/bigframes/series.py b/bigframes/series.py index dfa6fa4b0d..21f1f3b4e4 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -50,6 +50,7 @@ import bigframes.operations.aggregations as agg_ops import bigframes.operations.base import bigframes.operations.datetimes as dt +import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings import bigframes.operations.structs as structs import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series @@ -1557,6 +1558,10 @@ def __array_ufunc__( def str(self) -> strings.StringMethods: return strings.StringMethods(self._block) + @property + def plot(self): + return plotting.PlotAccessor(self) + def _slice( self, start: typing.Optional[int] = None, diff --git a/docs/reference/bigframes.pandas/frame.rst b/docs/reference/bigframes.pandas/frame.rst index a49bcc8f7c..d1610accdd 100644 --- a/docs/reference/bigframes.pandas/frame.rst +++ b/docs/reference/bigframes.pandas/frame.rst @@ -7,3 +7,14 @@ DataFrame :members: :inherited-members: :undoc-members: + +Accessors +--------- + +Plotting handling +^^^^^^^^^^^^^^^^^ + +.. automodule:: bigframes.operations.plotting + :members: + :inherited-members: + :undoc-members: diff --git a/docs/reference/bigframes.pandas/series.rst b/docs/reference/bigframes.pandas/series.rst index e212904f3f..f14eb8e862 100644 --- a/docs/reference/bigframes.pandas/series.rst +++ b/docs/reference/bigframes.pandas/series.rst @@ -42,3 +42,12 @@ Struct handling :members: :inherited-members: :undoc-members: + +Plotting handling +^^^^^^^^^^^^^^^^^ + +.. 
automodule:: bigframes.operations.plotting + :members: + :inherited-members: + :undoc-members: + :noindex: diff --git a/setup.py b/setup.py index 516d5b8a19..027c1b76af 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "tabulate >= 0.9", "ipywidgets >=7.7.1", "humanize >= 4.6.0", + "matplotlib >= 3.7.1", ] extras = { # Optional test dependencies packages. If they're missed, may skip some tests. diff --git a/testing/constraints-3.9.txt b/testing/constraints-3.9.txt index c4fed64fbd..07c8b763f3 100644 --- a/testing/constraints-3.9.txt +++ b/testing/constraints-3.9.txt @@ -20,5 +20,6 @@ sqlglot==20.8.0 tabulate==0.9 ipywidgets==7.7.1 humanize==4.6.0 +matplotlib==3.7.1 # extras pandas-gbq==0.19.0 diff --git a/tests/system/small/operations/test_plot.py b/tests/system/small/operations/test_plot.py new file mode 100644 index 0000000000..44f31ec071 --- /dev/null +++ b/tests/system/small/operations/test_plot.py @@ -0,0 +1,168 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pandas._testing as tm +import pytest + + +def _check_legend_labels(ax, labels): + """ + Check the ax has expected legend label + """ + assert ax.get_legend() is not None + texts = ax.get_legend().get_texts() + if not isinstance(texts, list): + assert texts.get_text() == labels + else: + actual_labels = [t.get_text() for t in texts] + assert len(actual_labels) == len(labels) + for label, e in zip(actual_labels, labels): + assert label == e + + +def test_series_hist_bins(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bins = 5 + ax = scalars_df["int64_col"].plot.hist(bins=bins) + pd_ax = scalars_pandas_df["int64_col"].plot.hist(bins=bins) + + # Compares axis values and height between bigframes and pandas histograms. + # Note: Due to potential float rounding by matplotlib, this test may not + # be applied to all cases. + assert len(ax.patches) == len(pd_ax.patches) + for i in range(len(ax.patches)): + assert ax.patches[i].xy == pd_ax.patches[i].xy + assert ax.patches[i]._height == pd_ax.patches[i]._height + + +def test_dataframes_hist_bins(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bins = 7 + columns = ["int64_col", "int64_too", "float64_col"] + ax = scalars_df[columns].plot.hist(bins=bins) + pd_ax = scalars_pandas_df[columns].plot.hist(bins=bins) + + # Compares axis values and height between bigframes and pandas histograms. + # Note: Due to potential float rounding by matplotlib, this test may not + # be applied to all cases. 
+ assert len(ax.patches) == len(pd_ax.patches) + for i in range(len(ax.patches)): + assert ax.patches[i]._height == pd_ax.patches[i]._height + + +@pytest.mark.parametrize( + ("col_names"), + [ + pytest.param(["int64_col"]), + pytest.param(["float64_col"]), + pytest.param(["int64_too", "bool_col"]), + pytest.param(["bool_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["date_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["datetime_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["time_col"], marks=pytest.mark.xfail(raises=TypeError)), + pytest.param(["timestamp_col"], marks=pytest.mark.xfail(raises=TypeError)), + ], +) +def test_hist_include_types(scalars_dfs, col_names): + scalars_df, _ = scalars_dfs + ax = scalars_df[col_names].plot.hist() + assert len(ax.patches) == 10 + + +@pytest.mark.parametrize( + ("arg_name", "arg_value"), + [ + pytest.param( + "by", ["int64_col"], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "bins", [1, 3, 5], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "weight", [2, 3], marks=pytest.mark.xfail(raises=NotImplementedError) + ), + pytest.param( + "backend", + "backend.module", + marks=pytest.mark.xfail(raises=NotImplementedError), + ), + ], +) +def test_hist_not_implemented_error(scalars_dfs, arg_name, arg_value): + scalars_df, _ = scalars_dfs + kwargs = {arg_name: arg_value} + scalars_df.plot.hist(**kwargs) + + +def test_hist_kwargs_true_subplots(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + axes = scalars_df[columns].plot.hist(subplots=True) + pd_axes = scalars_pandas_df[columns].plot.hist(subplots=True) + assert len(axes) == len(pd_axes) + + expected_labels = (["int64_col"], ["int64_too"], ["float64_col"]) + for ax, labels in zip(axes, expected_labels): + _check_legend_labels(ax, labels) + + +def test_hist_kwargs_list_subplots(scalars_dfs): + scalars_df, 
scalars_pandas_df = scalars_dfs + columns = ["int64_col", "int64_too", "float64_col"] + subplots = [["int64_col", "int64_too"]] + axes = scalars_df[columns].plot.hist(subplots=subplots) + pd_axes = scalars_pandas_df[columns].plot.hist(subplots=subplots) + assert len(axes) == len(pd_axes) + + expected_labels = (["int64_col", "int64_too"], ["float64_col"]) + for ax, labels in zip(axes, expected_labels): + _check_legend_labels(ax, labels=labels) + + +@pytest.mark.parametrize( + ("orientation"), + [ + pytest.param("horizontal"), + pytest.param("vertical"), + ], +) +def test_hist_kwargs_orientation(scalars_dfs, orientation): + scalars_df, scalars_pandas_df = scalars_dfs + ax = scalars_df["int64_col"].plot.hist(orientation=orientation) + pd_ax = scalars_pandas_df["int64_col"].plot.hist(orientation=orientation) + assert ax.xaxis.get_label().get_text() == pd_ax.xaxis.get_label().get_text() + assert ax.yaxis.get_label().get_text() == pd_ax.yaxis.get_label().get_text() + + +def test_hist_kwargs_ticks_props(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + xticks = [20, 18] + yticks = [30, 40] + + ax = scalars_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks) + pd_ax = scalars_pandas_df["float64_col"].plot.hist(xticks=xticks, yticks=yticks) + xlabels = ax.get_xticklabels() + pd_xlables = pd_ax.get_xticklabels() + assert len(xlabels) == len(pd_xlables) + for i in range(len(pd_xlables)): + tm.assert_almost_equal(xlabels[i].get_fontsize(), pd_xlables[i].get_fontsize()) + tm.assert_almost_equal(xlabels[i].get_rotation(), pd_xlables[i].get_rotation()) + + ylabels = ax.get_yticklabels() + pd_ylables = pd_ax.get_yticklabels() + assert len(xlabels) == len(pd_xlables) + for i in range(len(pd_xlables)): + tm.assert_almost_equal(ylabels[i].get_fontsize(), pd_ylables[i].get_fontsize()) + tm.assert_almost_equal(ylabels[i].get_rotation(), pd_ylables[i].get_rotation()) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py 
b/third_party/bigframes_vendored/pandas/core/frame.py index f88649ca13..0399d9c5b9 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -5224,3 +5224,14 @@ def dot(self, other): the matrix product of self and other in a DataFrame. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def plot(self): + """ + Make plots of Dataframes. + + Returns: + bigframes.operations.plotting.PlotAccessor: + An accessor making plots. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 6c01a6dd0c..2c4f2aaa8f 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -3111,6 +3111,17 @@ def str(self): """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + @property + def plot(self): + """ + Make plots of Series. + + Returns: + bigframes.operations.plotting.PlotAccessor: + An accessor making plots. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): """ Whether elements in Series are contained in values. diff --git a/third_party/bigframes_vendored/pandas/plotting/_core.py b/third_party/bigframes_vendored/pandas/plotting/_core.py new file mode 100644 index 0000000000..d0425737ee --- /dev/null +++ b/third_party/bigframes_vendored/pandas/plotting/_core.py @@ -0,0 +1,48 @@ +from typing import Optional, Sequence + +from bigframes import constants + + +class PlotAccessor: + """ + Make plots of Series or DataFrame with the `matplotlib` backend. + """ + + def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs): + """ + Draw one histogram of the DataFrame’s columns. + + A histogram is a representation of the distribution of data. 
+ This function groups the values of all given Series in the DataFrame + into bins and draws all bins in one :class:`matplotlib.axes.Axes`. + This is useful when the DataFrame's Series are in a similar scale. + + Parameters + ---------- + by : str or sequence, optional + Column in the DataFrame to group by. It is not supported yet. + bins : int, default 10 + Number of histogram bins to be used. + **kwargs + Additional keyword arguments are documented in + :meth:`DataFrame.plot`. + + Returns + ------- + class:`matplotlib.AxesSubplot` + Return a histogram plot. + + Examples + -------- + For Series: + + .. plot:: + :context: close-figs + + >>> import bigframes.pandas as bpd + >>> import numpy as np + >>> df = bpd.DataFrame(np.random.randint(1, 7, 6000), columns=['one']) + >>> df['two'] = np.random.randint(1, 7, 6000) + np.random.randint(1, 7, 6000) + >>> ax = df.plot.hist(bins=12, alpha=0.5) + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 763edeb4f4e8bc4b8bb05a992dae80c49c245e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim=20Swe=C3=B1a=20=28Swast=29?= Date: Tue, 12 Mar 2024 12:46:16 -0500 Subject: [PATCH 16/21] fix: move `third_party.bigframes_vendored` to `bigframes_vendored` (#424) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will avoid potential conflicts when run from a location that contains the commonly named `third_party` directory. Note: I followed the instructions here: https://stackoverflow.com/a/17179022/101923 but I also had to add a `pyproject.toml` file to support editable installations (see: https://togithub.com/pypa/setuptools/issues/230#issuecomment-1473278299). Fixes internal issue 328781348. 
🦕 --- MANIFEST.in | 2 +- bigframes/_config/__init__.py | 3 +- bigframes/_config/display_options.py | 3 +- bigframes/_config/sampling_options.py | 2 +- bigframes/core/blocks.py | 2 +- bigframes/core/compile/aggregate_compiler.py | 2 +- bigframes/core/groupby/__init__.py | 2 +- bigframes/core/indexes/index.py | 2 +- bigframes/core/tools/datetimes.py | 2 +- bigframes/core/utils.py | 3 +- bigframes/core/window/__init__.py | 3 +- bigframes/dataframe.py | 4 +- bigframes/dtypes.py | 6 +-- bigframes/functions/remote_function.py | 2 +- bigframes/ml/base.py | 7 +-- bigframes/ml/cluster.py | 6 +-- bigframes/ml/compose.py | 7 +-- bigframes/ml/decomposition.py | 6 +-- bigframes/ml/ensemble.py | 24 +++++------ bigframes/ml/linear_model.py | 14 +++--- bigframes/ml/metrics/_metrics.py | 6 +-- bigframes/ml/metrics/pairwise.py | 3 +- bigframes/ml/pipeline.py | 6 +-- bigframes/ml/preprocessing.py | 43 ++++++++----------- bigframes/operations/base.py | 2 +- bigframes/operations/datetimes.py | 3 +- bigframes/operations/plotting.py | 3 +- bigframes/operations/strings.py | 3 +- bigframes/operations/structs.py | 3 +- bigframes/pandas/__init__.py | 12 +++--- bigframes/py.typed | 0 bigframes/series.py | 2 +- bigframes/session/__init__.py | 16 +++---- noxfile.py | 5 ++- owlbot.py | 2 +- pyproject.toml | 3 ++ setup.py | 10 ++++- tests/system/small/test_ibis.py | 2 +- tests/unit/test_remote_function.py | 2 +- .../tests/unit/test_pandas_helpers.py | 2 +- .../ibis/backends/bigquery/__init__.py | 4 +- .../ibis/backends/bigquery/registry.py | 3 +- .../ibis/expr/operations/__init__.py | 6 +-- .../bigframes_vendored/pandas/core/frame.py | 2 +- .../bigframes_vendored/pandas/core/generic.py | 3 +- .../bigframes_vendored/pandas/core/series.py | 6 +-- third_party/bigframes_vendored/py.typed | 0 .../sklearn/cluster/_kmeans.py | 3 +- .../sklearn/compose/_column_transformer.py | 3 +- .../sklearn/decomposition/_pca.py | 3 +- .../sklearn/linear_model/_base.py | 5 ++- .../sklearn/linear_model/_logistic.py | 7 
++- .../bigframes_vendored/sklearn/pipeline.py | 3 +- .../sklearn/preprocessing/_data.py | 3 +- .../sklearn/preprocessing/_discretization.py | 3 +- .../sklearn/preprocessing/_encoder.py | 3 +- .../sklearn/preprocessing/_label.py | 3 +- 57 files changed, 152 insertions(+), 138 deletions(-) create mode 100644 bigframes/py.typed create mode 100644 pyproject.toml create mode 100644 third_party/bigframes_vendored/py.typed diff --git a/MANIFEST.in b/MANIFEST.in index b422266a96..02b1f4ba4b 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -16,7 +16,7 @@ # Generated by synthtool. DO NOT EDIT! include README.rst LICENSE -recursive-include third_party * +recursive-include third_party/bigframes_vendored * recursive-include bigframes *.json *.proto py.typed recursive-include tests * global-exclude *.py[co] diff --git a/bigframes/_config/__init__.py b/bigframes/_config/__init__.py index 8dcebfce6a..bdd7a8f2d6 100644 --- a/bigframes/_config/__init__.py +++ b/bigframes/_config/__init__.py @@ -17,11 +17,12 @@ DataFrames from this package. 
""" +import bigframes_vendored.pandas._config.config as pandas_config + import bigframes._config.bigquery_options as bigquery_options import bigframes._config.compute_options as compute_options import bigframes._config.display_options as display_options import bigframes._config.sampling_options as sampling_options -import third_party.bigframes_vendored.pandas._config.config as pandas_config class Options: diff --git a/bigframes/_config/display_options.py b/bigframes/_config/display_options.py index afa36aa84c..2af07d30a8 100644 --- a/bigframes/_config/display_options.py +++ b/bigframes/_config/display_options.py @@ -18,10 +18,9 @@ import dataclasses from typing import Literal, Optional +import bigframes_vendored.pandas.core.config_init as vendored_pandas_config import pandas as pd -import third_party.bigframes_vendored.pandas.core.config_init as vendored_pandas_config - @dataclasses.dataclass class DisplayOptions: diff --git a/bigframes/_config/sampling_options.py b/bigframes/_config/sampling_options.py index a80b9601ca..f4fa0928e1 100644 --- a/bigframes/_config/sampling_options.py +++ b/bigframes/_config/sampling_options.py @@ -19,7 +19,7 @@ import dataclasses from typing import Literal, Optional -import third_party.bigframes_vendored.pandas.core.config_init as vendored_pandas_config +import bigframes_vendored.pandas.core.config_init as vendored_pandas_config @dataclasses.dataclass diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 993f2caa47..93dcd1d691 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -29,6 +29,7 @@ from typing import Iterable, List, Mapping, Optional, Sequence, Tuple import warnings +import bigframes_vendored.pandas.io.common as vendored_pandas_io_common import google.cloud.bigquery as bigquery import pandas as pd @@ -45,7 +46,6 @@ import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops import bigframes.session._io.pandas -import 
third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common # Type constraint for wherever column labels are used Label = typing.Hashable diff --git a/bigframes/core/compile/aggregate_compiler.py b/bigframes/core/compile/aggregate_compiler.py index 86ba16e347..7059c4fdc1 100644 --- a/bigframes/core/compile/aggregate_compiler.py +++ b/bigframes/core/compile/aggregate_compiler.py @@ -15,6 +15,7 @@ import typing from typing import cast, Optional +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis import ibis.expr.datatypes as ibis_dtypes import ibis.expr.types as ibis_types @@ -26,7 +27,6 @@ import bigframes.core.window_spec as window_spec import bigframes.dtypes as dtypes import bigframes.operations.aggregations as agg_ops -import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops scalar_compiler = scalar_compilers.scalar_op_compiler diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 9a0889b041..837eb28f68 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -16,6 +16,7 @@ import typing +import bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby import pandas as pd import bigframes.constants as constants @@ -30,7 +31,6 @@ import bigframes.dtypes as dtypes import bigframes.operations.aggregations as agg_ops import bigframes.series as series -import third_party.bigframes_vendored.pandas.core.groupby as vendored_pandas_groupby @log_adapter.class_logger diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 3ae4fbe24a..328dd49397 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -19,6 +19,7 @@ import typing from typing import Hashable, Optional, Sequence, Union +import bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index import google.cloud.bigquery as bigquery import numpy as np import pandas @@ -33,7 +34,6 @@ import 
bigframes.formatting_helpers as formatter import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops -import third_party.bigframes_vendored.pandas.core.indexes.base as vendored_pandas_index if typing.TYPE_CHECKING: import bigframes.dataframe diff --git a/bigframes/core/tools/datetimes.py b/bigframes/core/tools/datetimes.py index 093fa0a670..4aaf320c7a 100644 --- a/bigframes/core/tools/datetimes.py +++ b/bigframes/core/tools/datetimes.py @@ -16,6 +16,7 @@ from datetime import datetime from typing import Optional, Union +import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes import pandas as pd import bigframes.constants as constants @@ -23,7 +24,6 @@ import bigframes.dataframe import bigframes.operations as ops import bigframes.series -import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes def to_datetime( diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 4331999dd6..1976ec1e39 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -15,11 +15,10 @@ import typing from typing import Hashable, Iterable, List +import bigframes_vendored.pandas.io.common as vendored_pandas_io_common import pandas as pd import typing_extensions -import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common - UNNAMED_COLUMN_ID = "bigframes_unnamed_column" UNNAMED_INDEX_ID = "bigframes_unnamed_index" diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index 8711625f88..fb682c950e 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -16,11 +16,12 @@ import typing +import bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling + from bigframes.core import log_adapter import bigframes.core as core import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops -import third_party.bigframes_vendored.pandas.core.window.rolling as 
vendored_pandas_rolling @log_adapter.class_logger diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index a122212d04..6ed882987c 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -34,6 +34,8 @@ Union, ) +import bigframes_vendored.pandas.core.frame as vendored_pandas_frame +import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import google.api_core.exceptions import google.cloud.bigquery as bigquery import numpy @@ -63,8 +65,6 @@ import bigframes.series import bigframes.series as bf_series import bigframes.session._io.bigquery -import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame -import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing if typing.TYPE_CHECKING: import bigframes.session diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 8a2055ef7f..f29d653d4f 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -20,6 +20,9 @@ import typing from typing import Any, Dict, Iterable, Literal, Tuple, Union +import bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import geopandas as gpd # type: ignore import google.cloud.bigquery as bigquery import ibis @@ -31,9 +34,6 @@ import pyarrow as pa import bigframes.constants as constants -import third_party.bigframes_vendored.google_cloud_bigquery._pandas_helpers as gcb3p_pandas_helpers -import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes -import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops # Type hints for Pandas dtypes supported by BigQuery DataFrame Dtype = Union[ diff --git a/bigframes/functions/remote_function.py b/bigframes/functions/remote_function.py index 5bc8291f59..29c1c68e7c 100644 --- a/bigframes/functions/remote_function.py +++ 
b/bigframes/functions/remote_function.py @@ -32,6 +32,7 @@ if TYPE_CHECKING: from bigframes.session import Session +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes import cloudpickle import google.api_core.exceptions import google.api_core.retry @@ -47,7 +48,6 @@ from bigframes import clients import bigframes.constants as constants import bigframes.dtypes -import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes logger = logging.getLogger(__name__) diff --git a/bigframes/ml/base.py b/bigframes/ml/base.py index 845b64caf1..9001987e9a 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -24,12 +24,13 @@ import abc from typing import cast, Optional, TypeVar, Union +import bigframes_vendored.sklearn.base + from bigframes.ml import core import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.base -class BaseEstimator(third_party.bigframes_vendored.sklearn.base.BaseEstimator, abc.ABC): +class BaseEstimator(bigframes_vendored.sklearn.base.BaseEstimator, abc.ABC): """ A BigQuery DataFrames machine learning component following the SKLearn API design Ref: https://bit.ly/3NyhKjN @@ -80,7 +81,7 @@ def __repr__(self): # Estimator pretty printer adapted from Sklearn's, which is in turn an adaption of # the inbuilt pretty-printer in CPython - import third_party.bigframes_vendored.cpython._pprint as adapted_pprint + import bigframes_vendored.cpython._pprint as adapted_pprint prettyprinter = adapted_pprint._EstimatorPrettyPrinter( compact=True, indent=1, indent_at_name=True, n_max_elements_to_show=30 diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 6b79d356a2..360ab01453 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -19,22 +19,22 @@ from typing import Dict, List, Optional, Union +import bigframes_vendored.sklearn.cluster._kmeans from google.cloud import bigquery import bigframes from bigframes.core import log_adapter from 
bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.cluster._kmeans @log_adapter.class_logger class KMeans( base.UnsupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans, + bigframes_vendored.sklearn.cluster._kmeans.KMeans, ): - __doc__ = third_party.bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__ + __doc__ = bigframes_vendored.sklearn.cluster._kmeans.KMeans.__doc__ def __init__(self, n_clusters: int = 8): self.n_clusters = n_clusters diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index ace876dd2d..d35941b338 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -21,11 +21,12 @@ import typing from typing import List, Optional, Tuple, Union +import bigframes_vendored.sklearn.compose._column_transformer + from bigframes import constants from bigframes.core import log_adapter from bigframes.ml import base, core, globals, preprocessing, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.compose._column_transformer CompilablePreprocessorType = Union[ preprocessing.OneHotEncoder, @@ -40,10 +41,10 @@ @log_adapter.class_logger class ColumnTransformer( base.Transformer, - third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, + bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, ): __doc__ = ( - third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer.__doc__ + bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer.__doc__ ) def __init__( diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index ef777cb33a..f2b7c97994 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -19,21 +19,21 @@ from typing import List, Optional, Union +import bigframes_vendored.sklearn.decomposition._pca from google.cloud import bigquery import bigframes from 
bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.decomposition._pca @log_adapter.class_logger class PCA( base.UnsupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.decomposition._pca.PCA, + bigframes_vendored.sklearn.decomposition._pca.PCA, ): - __doc__ = third_party.bigframes_vendored.sklearn.decomposition._pca.PCA.__doc__ + __doc__ = bigframes_vendored.sklearn.decomposition._pca.PCA.__doc__ def __init__(self, n_components: int = 3): self.n_components = n_components diff --git a/bigframes/ml/ensemble.py b/bigframes/ml/ensemble.py index 7fcaa926ed..23b227de67 100644 --- a/bigframes/ml/ensemble.py +++ b/bigframes/ml/ensemble.py @@ -19,14 +19,14 @@ from typing import Dict, List, Literal, Optional, Union +import bigframes_vendored.sklearn.ensemble._forest +import bigframes_vendored.xgboost.sklearn from google.cloud import bigquery import bigframes from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.ensemble._forest -import third_party.bigframes_vendored.xgboost.sklearn _BQML_PARAMS_MAPPING = { "booster": "boosterType", @@ -51,9 +51,9 @@ @log_adapter.class_logger class XGBRegressor( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor, + bigframes_vendored.xgboost.sklearn.XGBRegressor, ): - __doc__ = third_party.bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ + __doc__ = bigframes_vendored.xgboost.sklearn.XGBRegressor.__doc__ def __init__( self, @@ -208,10 +208,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBRegressor: @log_adapter.class_logger class XGBClassifier( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier, + bigframes_vendored.xgboost.sklearn.XGBClassifier, ): - __doc__ = 
third_party.bigframes_vendored.xgboost.sklearn.XGBClassifier.__doc__ + __doc__ = bigframes_vendored.xgboost.sklearn.XGBClassifier.__doc__ def __init__( self, @@ -364,12 +364,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> XGBClassifier: @log_adapter.class_logger class RandomForestRegressor( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, + bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.ensemble._forest.RandomForestRegressor.__doc__ def __init__( self, @@ -531,12 +529,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> RandomForestRegresso @log_adapter.class_logger class RandomForestClassifier( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, + bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.ensemble._forest.RandomForestClassifier.__doc__ def __init__( self, diff --git a/bigframes/ml/linear_model.py b/bigframes/ml/linear_model.py index b0c4069352..68d1e12676 100644 --- a/bigframes/ml/linear_model.py +++ b/bigframes/ml/linear_model.py @@ -19,6 +19,8 @@ from typing import Dict, List, Literal, Optional, Union +import bigframes_vendored.sklearn.linear_model._base +import bigframes_vendored.sklearn.linear_model._logistic from google.cloud import bigquery import bigframes @@ -26,8 +28,6 @@ from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.linear_model._base -import third_party.bigframes_vendored.sklearn.linear_model._logistic _BQML_PARAMS_MAPPING = { 
"optimize_strategy": "optimizationStrategy", @@ -50,11 +50,9 @@ @log_adapter.class_logger class LinearRegression( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression, + bigframes_vendored.sklearn.linear_model._base.LinearRegression, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.linear_model._base.LinearRegression.__doc__ def __init__( self, @@ -184,10 +182,10 @@ def to_gbq(self, model_name: str, replace: bool = False) -> LinearRegression: @log_adapter.class_logger class LogisticRegression( base.SupervisedTrainablePredictor, - third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, + bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression, ): __doc__ = ( - third_party.bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression.__doc__ + bigframes_vendored.sklearn.linear_model._logistic.LogisticRegression.__doc__ ) # TODO(ashleyxu) support class_weights in the constructor. 
diff --git a/bigframes/ml/metrics/_metrics.py b/bigframes/ml/metrics/_metrics.py index 5c81f16e31..e8c7400f35 100644 --- a/bigframes/ml/metrics/_metrics.py +++ b/bigframes/ml/metrics/_metrics.py @@ -19,6 +19,9 @@ import typing from typing import Tuple, Union +import bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification +import bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking +import bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression import numpy as np import pandas as pd import sklearn.metrics as sklearn_metrics # type: ignore @@ -26,9 +29,6 @@ import bigframes.constants as constants from bigframes.ml import utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.metrics._classification as vendored_mertics_classification -import third_party.bigframes_vendored.sklearn.metrics._ranking as vendored_mertics_ranking -import third_party.bigframes_vendored.sklearn.metrics._regression as vendored_metrics_regression def r2_score( diff --git a/bigframes/ml/metrics/pairwise.py b/bigframes/ml/metrics/pairwise.py index ef2c08d471..bdbe4a682d 100644 --- a/bigframes/ml/metrics/pairwise.py +++ b/bigframes/ml/metrics/pairwise.py @@ -15,9 +15,10 @@ import inspect from typing import Union +import bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise + from bigframes.ml import core, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.metrics.pairwise as vendored_metrics_pairwise def paired_cosine_distances( diff --git a/bigframes/ml/pipeline.py b/bigframes/ml/pipeline.py index 4ae2bfe555..9289b613b8 100644 --- a/bigframes/ml/pipeline.py +++ b/bigframes/ml/pipeline.py @@ -20,6 +20,7 @@ from typing import cast, List, Optional, Tuple, Union +import bigframes_vendored.sklearn.pipeline from google.cloud import bigquery import bigframes @@ -27,15 +28,14 @@ from bigframes.core import log_adapter from bigframes.ml import base, 
compose, forecasting, loader, preprocessing, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.pipeline @log_adapter.class_logger class Pipeline( base.BaseEstimator, - third_party.bigframes_vendored.sklearn.pipeline.Pipeline, + bigframes_vendored.sklearn.pipeline.Pipeline, ): - __doc__ = third_party.bigframes_vendored.sklearn.pipeline.Pipeline.__doc__ + __doc__ = bigframes_vendored.sklearn.pipeline.Pipeline.__doc__ def __init__(self, steps: List[Tuple[str, base.BaseEstimator]]): self.steps = steps diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index a403e57e71..23eab42978 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -20,23 +20,22 @@ import typing from typing import Any, cast, List, Literal, Optional, Tuple, Union +import bigframes_vendored.sklearn.preprocessing._data +import bigframes_vendored.sklearn.preprocessing._discretization +import bigframes_vendored.sklearn.preprocessing._encoder +import bigframes_vendored.sklearn.preprocessing._label + from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils import bigframes.pandas as bpd -import third_party.bigframes_vendored.sklearn.preprocessing._data -import third_party.bigframes_vendored.sklearn.preprocessing._discretization -import third_party.bigframes_vendored.sklearn.preprocessing._encoder -import third_party.bigframes_vendored.sklearn.preprocessing._label @log_adapter.class_logger class StandardScaler( base.Transformer, - third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, + bigframes_vendored.sklearn.preprocessing._data.StandardScaler, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.preprocessing._data.StandardScaler.__doc__ def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None @@ -116,11 +115,9 @@ def transform(self, X: Union[bpd.DataFrame, 
bpd.Series]) -> bpd.DataFrame: @log_adapter.class_logger class MaxAbsScaler( base.Transformer, - third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler, + bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.preprocessing._data.MaxAbsScaler.__doc__ def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None @@ -200,11 +197,9 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @log_adapter.class_logger class MinMaxScaler( base.Transformer, - third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler, + bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler, ): - __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.preprocessing._data.MinMaxScaler.__doc__ def __init__(self): self._bqml_model: Optional[core.BqmlModel] = None @@ -284,10 +279,10 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @log_adapter.class_logger class KBinsDiscretizer( base.Transformer, - third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer, + bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer, ): __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer.__doc__ + bigframes_vendored.sklearn.preprocessing._discretization.KBinsDiscretizer.__doc__ ) def __init__( @@ -403,15 +398,13 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @log_adapter.class_logger class OneHotEncoder( base.Transformer, - third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, + bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, ): # BQML max value 
https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax TOP_K_DEFAULT = 1000000 FREQUENCY_THRESHOLD_DEFAULT = 0 - __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder.__doc__ # All estimators must implement __init__ to document their parameters, even # if they don't have any @@ -533,15 +526,13 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: @log_adapter.class_logger class LabelEncoder( base.LabelTransformer, - third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, + bigframes_vendored.sklearn.preprocessing._label.LabelEncoder, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax TOP_K_DEFAULT = 1000000 FREQUENCY_THRESHOLD_DEFAULT = 0 - __doc__ = ( - third_party.bigframes_vendored.sklearn.preprocessing._label.LabelEncoder.__doc__ - ) + __doc__ = bigframes_vendored.sklearn.preprocessing._label.LabelEncoder.__doc__ # All estimators must implement __init__ to document their parameters, even # if they don't have any diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 154247c033..9bfa0500b5 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -16,6 +16,7 @@ import typing +import bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing import pandas as pd import bigframes.constants as constants @@ -28,7 +29,6 @@ import bigframes.operations.aggregations as agg_ops import bigframes.series as series import bigframes.session -import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing class SeriesMethods: diff --git a/bigframes/operations/datetimes.py b/bigframes/operations/datetimes.py index 3165e6f003..66ec347add 100644 --- a/bigframes/operations/datetimes.py +++ b/bigframes/operations/datetimes.py 
@@ -17,11 +17,12 @@ import datetime as dt from typing import Optional +import bigframes_vendored.pandas.core.indexes.accessor as vendordt + from bigframes.core import log_adapter import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -import third_party.bigframes_vendored.pandas.core.indexes.accessor as vendordt @log_adapter.class_logger diff --git a/bigframes/operations/plotting.py b/bigframes/operations/plotting.py index ef36e9383a..d19485e65e 100644 --- a/bigframes/operations/plotting.py +++ b/bigframes/operations/plotting.py @@ -14,9 +14,10 @@ from typing import Optional, Sequence +import bigframes_vendored.pandas.plotting._core as vendordt + import bigframes.constants as constants import bigframes.operations._matplotlib as bfplt -import third_party.bigframes_vendored.pandas.plotting._core as vendordt class PlotAccessor: diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 2798f18b38..abd45a1453 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -17,13 +17,14 @@ import re from typing import cast, Literal, Optional, Union +import bigframes_vendored.pandas.core.strings.accessor as vendorstr + import bigframes.constants as constants from bigframes.core import log_adapter import bigframes.dataframe as df import bigframes.operations as ops import bigframes.operations.base import bigframes.series as series -import third_party.bigframes_vendored.pandas.core.strings.accessor as vendorstr # Maps from python to re2 REGEXP_FLAGS = { diff --git a/bigframes/operations/structs.py b/bigframes/operations/structs.py index 0e00b781c9..e8a1af9602 100644 --- a/bigframes/operations/structs.py +++ b/bigframes/operations/structs.py @@ -14,12 +14,13 @@ from __future__ import annotations +import bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors + from bigframes.core import log_adapter import bigframes.dataframe import bigframes.operations import 
bigframes.operations.base import bigframes.series -import third_party.bigframes_vendored.pandas.core.arrays.arrow.accessors as vendoracessors @log_adapter.class_logger diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 195d7eabfa..03c8412907 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -36,6 +36,12 @@ Union, ) +import bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding +import bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge +import bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile +import bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes +import bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq from google.cloud import bigquery import numpy import pandas @@ -59,12 +65,6 @@ import bigframes.series import bigframes.session import bigframes.session.clients -import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat -import third_party.bigframes_vendored.pandas.core.reshape.encoding as vendored_pandas_encoding -import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge -import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile -import third_party.bigframes_vendored.pandas.core.tools.datetimes as vendored_pandas_datetimes -import third_party.bigframes_vendored.pandas.io.gbq as vendored_pandas_gbq # Include method definition so that the method appears in our docs for diff --git a/bigframes/py.typed b/bigframes/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/bigframes/series.py b/bigframes/series.py index 21f1f3b4e4..ef2feb4f92 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -23,6 +23,7 @@ import typing from typing import Any, Mapping, Optional, Tuple, Union +import bigframes_vendored.pandas.core.series as 
vendored_pandas_series import google.cloud.bigquery as bigquery import numpy import pandas @@ -53,7 +54,6 @@ import bigframes.operations.plotting as plotting import bigframes.operations.strings as strings import bigframes.operations.structs as structs -import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series LevelType = typing.Union[str, int] LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index b553865ea9..e3c392cd2f 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -39,6 +39,14 @@ ) import warnings +# Even though the ibis.backends.bigquery import is unused, it's needed +# to register new and replacement ops with the Ibis BigQuery backend. +import bigframes_vendored.ibis.backends.bigquery # noqa +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops +import bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq +import bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet +import bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers +import bigframes_vendored.pandas.io.pickle as third_party_pandas_pickle import google.api_core.client_info import google.api_core.client_options import google.api_core.exceptions @@ -81,14 +89,6 @@ import bigframes.session.clients import bigframes.version -# Even though the ibis.backends.bigquery import is unused, it's needed -# to register new and replacement ops with the Ibis BigQuery backend. 
-import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops -import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq -import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet -import third_party.bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers -import third_party.bigframes_vendored.pandas.io.pickle as third_party_pandas_pickle - _BIGFRAMES_DEFAULT_CONNECTION_ID = "bigframes-default-connection" _MAX_CLUSTER_COLUMNS = 4 diff --git a/noxfile.py b/noxfile.py index db503c43fd..e7f238c01f 100644 --- a/noxfile.py +++ b/noxfile.py @@ -219,7 +219,10 @@ def unit_noextras(session): @nox.session(python=DEFAULT_PYTHON_VERSION) def mypy(session): """Run type checks with mypy.""" - session.install("-e", ".") + # Editable mode is not compatible with mypy when there are multiple + # package directories. See: + # https://github.com/python/mypy/issues/10564#issuecomment-851687749 + session.install(".") # Just install the dependencies' type info directly, since "mypy --install-types" # might require an additional pass. 
diff --git a/owlbot.py b/owlbot.py index dc84de7d8f..4dc6d1aca3 100644 --- a/owlbot.py +++ b/owlbot.py @@ -63,7 +63,7 @@ s.replace( ["MANIFEST.in"], re.escape("recursive-include google"), - "recursive-include third_party *\nrecursive-include bigframes", + "recursive-include third_party/bigframes_vendored *\nrecursive-include bigframes", ) # Even though BigQuery DataFrames isn't technically a client library, we are diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..fed528d4a7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 027c1b76af..a626fd4b34 100644 --- a/setup.py +++ b/setup.py @@ -88,7 +88,11 @@ packages = [ package for package in setuptools.find_namespace_packages() - if package.startswith("bigframes") or package.startswith("third_party") + if package.startswith("bigframes") +] + [ + package + for package in setuptools.find_namespace_packages("third_party") + if package.startswith("bigframes_vendored") ] setuptools.setup( @@ -115,6 +119,10 @@ install_requires=dependencies, extras_require=extras, platforms="Posix; MacOS X; Windows", + package_dir={ + "bigframes": "bigframes", + "bigframes_vendored": "third_party/bigframes_vendored", + }, packages=packages, python_requires=">=3.9", include_package_data=True, diff --git a/tests/system/small/test_ibis.py b/tests/system/small/test_ibis.py index 9fe1176068..e2648d1eba 100644 --- a/tests/system/small/test_ibis.py +++ b/tests/system/small/test_ibis.py @@ -14,10 +14,10 @@ """Tests for monkeypatched ibis code.""" +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops import ibis.expr.types as ibis_types import bigframes -import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops def test_approximate_quantiles(session: bigframes.Session, scalars_table_id: str): diff --git a/tests/unit/test_remote_function.py 
b/tests/unit/test_remote_function.py index 629bc5326a..1acff27c7f 100644 --- a/tests/unit/test_remote_function.py +++ b/tests/unit/test_remote_function.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes from ibis.expr import datatypes as ibis_types import bigframes.dtypes -import third_party.bigframes_vendored.ibis.backends.bigquery.datatypes as third_party_ibis_bqtypes def test_supported_types_correspond(): diff --git a/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py index dc4a09cc54..c798b0d169 100644 --- a/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py +++ b/third_party/bigframes_vendored/google_cloud_bigquery/tests/unit/test_pandas_helpers.py @@ -25,7 +25,7 @@ @pytest.fixture def module_under_test(): - from third_party.bigframes_vendored.google_cloud_bigquery import _pandas_helpers + from bigframes_vendored.google_cloud_bigquery import _pandas_helpers return _pandas_helpers diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py index 43508fab11..1d2d05a741 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/__init__.py @@ -1,3 +1,3 @@ # Import all sub-modules to monkeypatch everything. 
-import third_party.bigframes_vendored.ibis.backends.bigquery.compiler # noqa -import third_party.bigframes_vendored.ibis.backends.bigquery.registry # noqa +import bigframes_vendored.ibis.backends.bigquery.compiler # noqa +import bigframes_vendored.ibis.backends.bigquery.registry # noqa diff --git a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py index e1b28690d7..3f89feaa34 100644 --- a/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py +++ b/third_party/bigframes_vendored/ibis/backends/bigquery/registry.py @@ -1,10 +1,9 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/backends/bigquery/registry.py """Module to convert from Ibis expression to SQL string.""" +import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops from ibis.backends.bigquery.registry import OPERATION_REGISTRY -import third_party.bigframes_vendored.ibis.expr.operations as vendored_ibis_ops - def _approx_quantiles(translator, op: vendored_ibis_ops.ApproximateMultiQuantile): arg = translator.translate(op.arg) diff --git a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py index 8219701392..2c2efe528d 100644 --- a/third_party/bigframes_vendored/ibis/expr/operations/__init__.py +++ b/third_party/bigframes_vendored/ibis/expr/operations/__init__.py @@ -1,6 +1,6 @@ # Contains code from https://github.com/ibis-project/ibis/blob/master/ibis/expr/operations/__init__.py from __future__ import annotations -from third_party.bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 -from third_party.bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 -from third_party.bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 +from bigframes_vendored.ibis.expr.operations.analytic import * # noqa: F401 F403 +from 
bigframes_vendored.ibis.expr.operations.json import * # noqa: F401 F403 +from bigframes_vendored.ibis.expr.operations.reductions import * # noqa: F401 F403 diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 0399d9c5b9..313c6663c8 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -13,11 +13,11 @@ from typing import Hashable, Iterable, Literal, Mapping, Optional, Sequence, Union +from bigframes_vendored.pandas.core.generic import NDFrame import numpy as np import pandas as pd from bigframes import constants -from third_party.bigframes_vendored.pandas.core.generic import NDFrame # ----------------------------------------------------------------------- # DataFrame class diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index b55c7e23d8..01d8f7a174 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -3,8 +3,9 @@ from typing import Iterator, Literal, Optional +from bigframes_vendored.pandas.core import indexing + from bigframes import constants -from third_party.bigframes_vendored.pandas.core import indexing class NDFrame(indexing.IndexingMixin): diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 2c4f2aaa8f..beaf8aedb1 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -5,16 +5,16 @@ from typing import Hashable, IO, Literal, Mapping, Sequence, TYPE_CHECKING +from bigframes_vendored.pandas.core.generic import NDFrame import numpy as np from pandas._libs import lib from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer from bigframes import constants -from third_party.bigframes_vendored.pandas.core.generic import 
NDFrame if TYPE_CHECKING: - from third_party.bigframes_vendored.pandas.core.frame import DataFrame - from third_party.bigframes_vendored.pandas.core.groupby import SeriesGroupBy + from bigframes_vendored.pandas.core.frame import DataFrame + from bigframes_vendored.pandas.core.groupby import SeriesGroupBy class Series(NDFrame): # type: ignore[misc] diff --git a/third_party/bigframes_vendored/py.typed b/third_party/bigframes_vendored/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index be6c5e7c52..d72b9b7bd5 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -13,8 +13,9 @@ from abc import ABC +from bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class _BaseKMeans(BaseEstimator, ABC): diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index dead173b2d..b08eb10492 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -6,8 +6,9 @@ from abc import ABCMeta +from bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class _BaseComposition(BaseEstimator, metaclass=ABCMeta): diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 011ecc06dd..30c9c3b0b6 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -12,8 +12,9 @@ from abc import ABCMeta +from 
bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class PCA(BaseEstimator, metaclass=ABCMeta): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index ab946e5861..ad2c872468 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -17,13 +17,14 @@ from abc import ABCMeta -from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import ( +from bigframes_vendored.sklearn.base import ( BaseEstimator, ClassifierMixin, RegressorMixin, ) +from bigframes import constants + class LinearModel(BaseEstimator, metaclass=ABCMeta): def predict(self, X): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 989ca03c82..621c78d551 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -12,14 +12,13 @@ # Original location: https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/linear_model/_logistic.py -from typing import List, Optional - -from bigframes import constants -from third_party.bigframes_vendored.sklearn.linear_model._base import ( +from bigframes_vendored.sklearn.linear_model._base import ( BaseEstimator, LinearClassifierMixin, ) +from bigframes import constants + class LogisticRegression(LinearClassifierMixin, BaseEstimator): """Logistic Regression (aka logit, MaxEnt) classifier. 
diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index 4b8eb25a97..aed1565960 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -11,8 +11,9 @@ from abc import ABCMeta +from bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class Pipeline(BaseEstimator, metaclass=ABCMeta): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index 5ce102d573..1ff83aa640 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -7,8 +7,9 @@ # Eric Chang # License: BSD 3 clause +from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin class StandardScaler(BaseEstimator, TransformerMixin): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py index 0236558dd4..5fcc481573 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_discretization.py @@ -3,8 +3,9 @@ # License: BSD +from bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator, TransformerMixin class KBinsDiscretizer(TransformerMixin, BaseEstimator): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index 8da9a98c53..5e5e8ac042 100644 --- 
a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -2,8 +2,9 @@ # Joris Van den Bossche # License: BSD 3 clause +from bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class OneHotEncoder(BaseEstimator): diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py index 83f8eb0f9c..cc6b995c8c 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_label.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_label.py @@ -6,8 +6,9 @@ # Hamzeh Alsalhi # License: BSD 3 clause +from bigframes_vendored.sklearn.base import BaseEstimator + from bigframes import constants -from third_party.bigframes_vendored.sklearn.base import BaseEstimator class LabelEncoder(BaseEstimator): From ae0e3eaca49171fd449de4d43ddc3e3ce9fdc2ce Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 12 Mar 2024 11:50:16 -0700 Subject: [PATCH 17/21] fix: read_pandas inline respects location (#412) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes b/327544164 🦕 --- bigframes/core/__init__.py | 4 ++-- bigframes/core/blocks.py | 4 ++-- bigframes/core/nodes.py | 1 + bigframes/dataframe.py | 2 +- bigframes/session/__init__.py | 2 +- notebooks/location/regionalized.ipynb | 2 +- tests/system/small/test_dataframe.py | 17 +++++++++++++++++ tests/system/small/test_session.py | 11 +++++++++++ tests/unit/core/test_blocks.py | 9 ++++++++- 9 files changed, 44 insertions(+), 8 deletions(-) diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9032993452..e4a60e08e1 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -69,7 +69,7 @@ def from_ibis( return cls(node) @classmethod - def from_pandas(cls, pd_df: pandas.DataFrame): + def from_pandas(cls, pd_df: pandas.DataFrame, session: bigframes.Session): iobytes = io.BytesIO() # Use alphanumeric identifiers, to avoid downstream problems with escaping. 
as_ids = [ @@ -78,7 +78,7 @@ def from_pandas(cls, pd_df: pandas.DataFrame): ] unique_ids = tuple(bigframes.core.utils.disambiguate_ids(as_ids)) pd_df.reset_index(drop=True).set_axis(unique_ids, axis=1).to_feather(iobytes) - node = nodes.ReadLocalNode(iobytes.getvalue()) + node = nodes.ReadLocalNode(feather_bytes=iobytes.getvalue(), session=session) return cls(node) @property diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 93dcd1d691..375ce7e7e0 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -140,7 +140,7 @@ def __init__( self._stats_cache[" ".join(self.index_columns)] = {} @classmethod - def from_local(cls, data) -> Block: + def from_local(cls, data, session: bigframes.Session) -> Block: pd_data = pd.DataFrame(data) columns = pd_data.columns @@ -162,7 +162,7 @@ def from_local(cls, data) -> Block: ) index_ids = pd_data.columns[: len(index_labels)] - keys_expr = core.ArrayValue.from_pandas(pd_data) + keys_expr = core.ArrayValue.from_pandas(pd_data, session) return cls( keys_expr, column_labels=columns, diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 1cd3277cbc..9da535e15f 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -155,6 +155,7 @@ def __hash__(self): @dataclass(frozen=True) class ReadLocalNode(BigFrameNode): feather_bytes: bytes + session: typing.Optional[bigframes.session.Session] = None def __hash__(self): return self._node_hash diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6ed882987c..5dae7a82f9 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -1646,7 +1646,7 @@ def _reindex_rows( raise NotImplementedError( "Cannot reindex with index with different nlevels" ) - new_indexer = DataFrame(index=index)[[]] + new_indexer = DataFrame(index=index, session=self._session)[[]] # multiindex join is senstive to index names, so we will set all these result = new_indexer.rename_axis(range(new_indexer.index.nlevels)).join( 
self.rename_axis(range(self.index.nlevels)), diff --git a/bigframes/session/__init__.py b/bigframes/session/__init__.py index e3c392cd2f..5266267a22 100644 --- a/bigframes/session/__init__.py +++ b/bigframes/session/__init__.py @@ -916,7 +916,7 @@ def _read_pandas( def _read_pandas_inline( self, pandas_dataframe: pandas.DataFrame ) -> dataframe.DataFrame: - return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe)) + return dataframe.DataFrame(blocks.Block.from_local(pandas_dataframe, self)) def _read_pandas_load_job( self, pandas_dataframe: pandas.DataFrame, api_name: str diff --git a/notebooks/location/regionalized.ipynb b/notebooks/location/regionalized.ipynb index a7ff5db84e..86f43b1dd6 100644 --- a/notebooks/location/regionalized.ipynb +++ b/notebooks/location/regionalized.ipynb @@ -2791,7 +2791,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.9" }, "orig_nbformat": 4 }, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 9f4e138b73..61dcd778ef 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -93,6 +93,23 @@ def test_df_construct_from_dict(): ) +def test_df_construct_inline_respects_location(): + import bigframes.pandas as bpd + + bpd.close_session() + bpd.options.bigquery.location = "europe-west1" + + df = bpd.DataFrame([[1, 2, 3], [4, 5, 6]]) + repr(df) + + table = bpd.get_global_session().bqclient.get_table(df.query_job.destination) + assert table.location == "europe-west1" + + # Reset global session + bpd.close_session() + bpd.options.bigquery.location = "us" + + def test_get_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2e2252be06..aba4a52c43 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -369,6 +369,17 
@@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_inline_respects_location(): + options = bigframes.BigQueryOptions(location="europe-west1") + session = bigframes.Session(options) + + df = session.read_pandas(pd.DataFrame([[1, 2, 3], [4, 5, 6]])) + repr(df) + + table = session.bqclient.get_table(df.query_job.destination) + assert table.location == "europe-west1" + + def test_read_pandas_col_label_w_space(session: bigframes.Session): expected = pd.DataFrame( { diff --git a/tests/unit/core/test_blocks.py b/tests/unit/core/test_blocks.py index 5a4f0951d3..0bb5e0101a 100644 --- a/tests/unit/core/test_blocks.py +++ b/tests/unit/core/test_blocks.py @@ -12,10 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pandas import pandas.testing import pytest +import bigframes import bigframes.core.blocks as blocks @@ -74,8 +77,12 @@ ) def test_block_from_local(data): expected = pandas.DataFrame(data) + mock_session = mock.create_autospec(spec=bigframes.Session) + + # hard-coded the returned dimension of the session for that each of the test case contains 3 rows. 
+ mock_session._execute.return_value = (iter([[3]]), None) - block = blocks.Block.from_local(data) + block = blocks.Block.from_local(data, mock_session) pandas.testing.assert_index_equal(block.column_labels, expected.columns) assert tuple(block.index.names) == tuple(expected.index.names) From 8d8294544ac7fedaca753c5473e3ca2a27868420 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 12 Mar 2024 12:33:08 -0700 Subject: [PATCH 18/21] feat: add ml PCA.detect_anomalies method (#422) * feat: add ml detect_anomalies * add PCA.detect_anomalies * fix mypy --- bigframes/ml/core.py | 18 +++++++++---- bigframes/ml/decomposition.py | 28 +++++++++++++++++++ bigframes/ml/imported.py | 5 ++-- bigframes/ml/remote.py | 5 ++-- bigframes/ml/sql.py | 8 ++++++ tests/system/small/ml/test_core.py | 30 +++++++++++++++++++-- tests/system/small/ml/test_decomposition.py | 26 +++++++++++++++++- tests/unit/ml/test_sql.py | 20 ++++++++++++-- 8 files changed, 124 insertions(+), 16 deletions(-) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 24997708fb..43a882ecac 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -128,14 +128,12 @@ def model(self) -> bigquery.Model: return self._model def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: - # TODO: validate input data schema return self._apply_sql( input_data, self._model_manipulation_sql_generator.ml_predict, ) def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: - # TODO: validate input data schema return self._apply_sql( input_data, self._model_manipulation_sql_generator.ml_transform, @@ -146,7 +144,6 @@ def generate_text( input_data: bpd.DataFrame, options: Mapping[str, int | float], ) -> bpd.DataFrame: - # TODO: validate input data schema return self._apply_sql( input_data, lambda source_df: self._model_manipulation_sql_generator.ml_generate_text( @@ -160,7 +157,6 @@ def generate_text_embedding( input_data: bpd.DataFrame, options: Mapping[str, int 
| float], ) -> bpd.DataFrame: - # TODO: validate input data schema return self._apply_sql( input_data, lambda source_df: self._model_manipulation_sql_generator.ml_generate_text_embedding( @@ -169,12 +165,24 @@ def generate_text_embedding( ), ) + def detect_anomalies( + self, input_data: bpd.DataFrame, options: Mapping[str, int | float] + ) -> bpd.DataFrame: + assert self._model.model_type in ("PCA", "KMEANS", "ARIMA_PLUS") + + return self._apply_sql( + input_data, + lambda source_df: self._model_manipulation_sql_generator.ml_detect_anomalies( + source_df=source_df, + struct_options=options, + ), + ) + def forecast(self, options: Mapping[str, int | float]) -> bpd.DataFrame: sql = self._model_manipulation_sql_generator.ml_forecast(struct_options=options) return self._session.read_gbq(sql, index_col="forecast_timestamp").reset_index() def evaluate(self, input_data: Optional[bpd.DataFrame] = None): - # TODO: validate input data schema sql = self._model_manipulation_sql_generator.ml_evaluate(input_data) return self._session.read_gbq(sql) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index f2b7c97994..2714664dce 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -110,6 +110,34 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: return self._bqml_model.predict(X) + def detect_anomalies( + self, X: Union[bpd.DataFrame, bpd.Series], *, contamination=0.1 + ) -> bpd.DataFrame: + """Detect the anomaly data points of the input. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or a DataFrame to detect anomalies. + contamination (float, default 0.1): + Identifies the proportion of anomalies in the training dataset that are used to create the model. + The value must be in the range [0, 0.5]. 
+ + Returns: + bigframes.dataframe.DataFrame: detected DataFrame.""" + if contamination < 0.0 or contamination > 0.5: + raise ValueError( + f"contamination must be [0.0, 0.5], but is {contamination}." + ) + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before detect_anomalies") + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.detect_anomalies( + X, options={"contamination": contamination} + ) + def to_gbq(self, model_name: str, replace: bool = False) -> PCA: """Save the model to BigQuery. diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 98b23931f3..7f75827083 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -23,7 +23,6 @@ import bigframes from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils -from bigframes.ml.globals import _SUPPORTED_DTYPES import bigframes.pandas as bpd @@ -236,9 +235,9 @@ def _create_bqml_model(self): else: for io in (self.input, self.output): for v in io.values(): - if v not in _SUPPORTED_DTYPES: + if v not in globals._SUPPORTED_DTYPES: raise ValueError( - f"field_type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}." + f"field_type {v} is not supported. We only support {', '.join(globals._SUPPORTED_DTYPES)}." 
) return self._bqml_model_factory.create_xgboost_imported_model( diff --git a/bigframes/ml/remote.py b/bigframes/ml/remote.py index a4a95b39d1..2b83382e68 100644 --- a/bigframes/ml/remote.py +++ b/bigframes/ml/remote.py @@ -23,7 +23,6 @@ from bigframes import clients from bigframes.core import log_adapter from bigframes.ml import base, core, globals, utils -from bigframes.ml.globals import _SUPPORTED_DTYPES import bigframes.pandas as bpd _REMOTE_MODEL_STATUS = "remote_model_status" @@ -102,9 +101,9 @@ def standardize_type(v: str): v = v.lower() v = v.replace("boolean", "bool") - if v not in _SUPPORTED_DTYPES: + if v not in globals._SUPPORTED_DTYPES: raise ValueError( - f"Data type {v} is not supported. We only support {', '.join(_SUPPORTED_DTYPES)}." + f"Data type {v} is not supported. We only support {', '.join(globals._SUPPORTED_DTYPES)}." ) return v diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 7999cb90a3..fa74458e77 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -276,6 +276,14 @@ def ml_generate_text_embedding( return f"""SELECT * FROM ML.GENERATE_TEXT_EMBEDDING(MODEL `{self._model_name}`, ({self._source_sql(source_df)}), {struct_options_sql})""" + def ml_detect_anomalies( + self, source_df: bpd.DataFrame, struct_options: Mapping[str, Union[int, float]] + ) -> str: + """Encode ML.DETECT_ANOMALIES for BQML""" + struct_options_sql = self.struct_options(**struct_options) + return f"""SELECT * FROM ML.DETECT_ANOMALIES(MODEL `{self._model_name}`, + {struct_options_sql}, ({self._source_sql(source_df)}))""" + # ML evaluation TVFs def ml_evaluate(self, source_df: Optional[bpd.DataFrame] = None) -> str: """Encode ML.EVALUATE for BQML""" diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index d20867a2d7..02030cd31e 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -289,6 +289,29 @@ def test_model_predict_with_unnamed_index( ) +def test_model_detect_anomalies( + 
penguins_bqml_pca_model: core.BqmlModel, new_penguins_df +): + options = {"contamination": 0.25} + anomalies = penguins_bqml_pca_model.detect_anomalies( + new_penguins_df, options + ).to_pandas() + expected = pd.DataFrame( + { + "is_anomaly": [True, True, True], + "mean_squared_error": [0.254188, 0.731243, 0.298889], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "mean_squared_error"]].sort_index(), + expected, + check_exact=False, + check_dtype=False, + rtol=0.1, + ) + + def test_remote_model_predict( bqml_linear_remote_model: core.BqmlModel, new_penguins_df ): @@ -367,16 +390,19 @@ def test_model_forecast(time_series_bqml_arima_plus_model: core.BqmlModel): ) -def test_model_register(ephemera_penguins_bqml_linear_model): +def test_model_register(ephemera_penguins_bqml_linear_model: core.BqmlModel): model = ephemera_penguins_bqml_linear_model model.register() + assert model.model.model_id is not None model_name = "bigframes_" + model.model.model_id # Only registered model contains the field, and the field includes project/dataset. Here only check model_id. 
assert model_name in model.model.training_runs[-1]["vertexAiModelId"] -def test_model_register_with_params(ephemera_penguins_bqml_linear_model): +def test_model_register_with_params( + ephemera_penguins_bqml_linear_model: core.BqmlModel, +): model_name = "bigframes_system_test_model" model = ephemera_penguins_bqml_linear_model model.register(model_name) diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 9565b8f7a8..72fdc6d951 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -15,10 +15,13 @@ import pandas as pd from bigframes.ml import decomposition +import bigframes.pandas as bpd import tests.system.utils -def test_pca_predict(penguins_pca_model, new_penguins_df): +def test_pca_predict( + penguins_pca_model: decomposition.PCA, new_penguins_df: bpd.DataFrame +): predictions = penguins_pca_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( { @@ -35,6 +38,27 @@ def test_pca_predict(penguins_pca_model, new_penguins_df): ) +def test_pca_detect_anomalies( + penguins_pca_model: decomposition.PCA, new_penguins_df: bpd.DataFrame +): + anomalies = penguins_pca_model.detect_anomalies(new_penguins_df).to_pandas() + expected = pd.DataFrame( + { + "is_anomaly": [False, True, False], + "mean_squared_error": [0.254188, 0.731243, 0.298889], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "mean_squared_error"]].sort_index(), + expected, + check_exact=False, + check_dtype=False, + rtol=0.1, + ) + + def test_pca_score(penguins_pca_model: decomposition.PCA): result = penguins_pca_model.score().to_pandas() expected = pd.DataFrame( diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index 52c10f3144..913bab0379 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -341,9 +341,8 @@ def test_ml_centroids_correct( ) -def 
test_forecast_correct_sql( +def test_ml_forecast_correct_sql( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, - mock_df: bpd.DataFrame, ): sql = model_manipulation_sql_generator.ml_forecast( struct_options={"option_key1": 1, "option_key2": 2.2}, @@ -391,6 +390,23 @@ def test_ml_generate_text_embedding_correct( ) +def test_ml_detect_anomalies_correct_sql( + model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, + mock_df: bpd.DataFrame, +): + sql = model_manipulation_sql_generator.ml_detect_anomalies( + source_df=mock_df, + struct_options={"option_key1": 1, "option_key2": 2.2}, + ) + assert ( + sql + == """SELECT * FROM ML.DETECT_ANOMALIES(MODEL `my_project_id.my_dataset_id.my_model_id`, + STRUCT( + 1 AS option_key1, + 2.2 AS option_key2), (input_X_sql))""" + ) + + def test_ml_principal_components_correct( model_manipulation_sql_generator: ml_sql.ModelManipulationSqlGenerator, ): From 6df28ed704552ebec7869e1f2034614cb6407098 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 12 Mar 2024 15:26:16 -0700 Subject: [PATCH 19/21] feat: add detect_anomalies to ml ARIMAPlus and KMeans models (#426) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! 
That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) Fixes # 🦕 --- bigframes/ml/cluster.py | 28 +++++++++ bigframes/ml/decomposition.py | 2 +- bigframes/ml/forecasting.py | 30 +++++++++ tests/system/small/ml/test_cluster.py | 45 +++++++++++++ tests/system/small/ml/test_decomposition.py | 23 +++++++ tests/system/small/ml/test_forecasting.py | 70 +++++++++++++++++---- 6 files changed, 185 insertions(+), 13 deletions(-) diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 360ab01453..c294d1f424 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -96,6 +96,34 @@ def predict( return self._bqml_model.predict(X) + def detect_anomalies( + self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 + ) -> bpd.DataFrame: + """Detect the anomaly data points of the input. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or a DataFrame to detect anomalies. + contamination (float, default 0.1): + Identifies the proportion of anomalies in the training dataset that are used to create the model. + The value must be in the range [0, 0.5]. + + Returns: + bigframes.dataframe.DataFrame: detected DataFrame.""" + if contamination < 0.0 or contamination > 0.5: + raise ValueError( + f"contamination must be [0.0, 0.5], but is {contamination}." + ) + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before detect_anomalies") + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.detect_anomalies( + X, options={"contamination": contamination} + ) + def to_gbq(self, model_name: str, replace: bool = False) -> KMeans: """Save the model to BigQuery. 
diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 2714664dce..9dc60be78f 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -111,7 +111,7 @@ def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: return self._bqml_model.predict(X) def detect_anomalies( - self, X: Union[bpd.DataFrame, bpd.Series], *, contamination=0.1 + self, X: Union[bpd.DataFrame, bpd.Series], *, contamination: float = 0.1 ) -> bpd.DataFrame: """Detect the anomaly data points of the input. diff --git a/bigframes/ml/forecasting.py b/bigframes/ml/forecasting.py index 0c33660475..18380328c7 100644 --- a/bigframes/ml/forecasting.py +++ b/bigframes/ml/forecasting.py @@ -119,6 +119,36 @@ def predict( options={"horizon": horizon, "confidence_level": confidence_level} ) + def detect_anomalies( + self, + X: Union[bpd.DataFrame, bpd.Series], + *, + anomaly_prob_threshold: float = 0.95, + ) -> bpd.DataFrame: + """Detect the anomaly data points of the input. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or a DataFrame to detect anomalies. + anomaly_prob_threshold (float, default 0.95): + Identifies the custom threshold to use for anomaly detection. The value must be in the range [0, 1), with a default value of 0.95. + + Returns: + bigframes.dataframe.DataFrame: detected DataFrame.""" + if anomaly_prob_threshold < 0.0 or anomaly_prob_threshold >= 1.0: + raise ValueError( + f"anomaly_prob_threshold must be [0.0, 1.0), but is {anomaly_prob_threshold}." 
+ ) + + if not self._bqml_model: + raise RuntimeError("A model must be fitted before detect_anomalies") + + (X,) = utils.convert_to_dataframe(X) + + return self._bqml_model.detect_anomalies( + X, options={"anomaly_prob_threshold": anomaly_prob_threshold} + ) + def score( self, X: Union[bpd.DataFrame, bpd.Series], diff --git a/tests/system/small/ml/test_cluster.py b/tests/system/small/ml/test_cluster.py index a9fec0bbce..96066e5fbe 100644 --- a/tests/system/small/ml/test_cluster.py +++ b/tests/system/small/ml/test_cluster.py @@ -15,6 +15,7 @@ import pandas as pd from bigframes.ml import cluster +import bigframes.pandas as bpd from tests.system.utils import assert_pandas_df_equal _PD_NEW_PENGUINS = pd.DataFrame.from_dict( @@ -73,6 +74,50 @@ def test_kmeans_predict(session, penguins_kmeans_model: cluster.KMeans): assert_pandas_df_equal(result, expected, ignore_order=True) +def test_kmeans_detect_anomalies( + penguins_kmeans_model: cluster.KMeans, new_penguins_df: bpd.DataFrame +): + anomalies = penguins_kmeans_model.detect_anomalies(new_penguins_df).to_pandas() + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False], + "normalized_distance": [1.082937, 0.77139, 0.478304], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "normalized_distance"]].sort_index(), + expected, + check_exact=False, + check_dtype=False, + rtol=0.1, + ) + + +def test_kmeans_detect_anomalies_params( + penguins_kmeans_model: cluster.KMeans, new_penguins_df: bpd.DataFrame +): + anomalies = penguins_kmeans_model.detect_anomalies( + new_penguins_df, contamination=0.4 + ).to_pandas() + expected = pd.DataFrame( + { + "is_anomaly": [True, False, False], + "normalized_distance": [1.082937, 0.77139, 0.478304], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "normalized_distance"]].sort_index(), + expected, 
+ check_exact=False, + check_dtype=False, + rtol=0.1, + ) + + def test_kmeans_score(session, penguins_kmeans_model: cluster.KMeans): new_penguins = session.read_pandas(_PD_NEW_PENGUINS) result = penguins_kmeans_model.score(new_penguins).to_pandas() diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 72fdc6d951..9eb9b25ea1 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -59,6 +59,29 @@ def test_pca_detect_anomalies( ) +def test_pca_detect_anomalies_params( + penguins_pca_model: decomposition.PCA, new_penguins_df: bpd.DataFrame +): + anomalies = penguins_pca_model.detect_anomalies( + new_penguins_df, contamination=0.2 + ).to_pandas() + expected = pd.DataFrame( + { + "is_anomaly": [False, True, True], + "mean_squared_error": [0.254188, 0.731243, 0.298889], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "mean_squared_error"]].sort_index(), + expected, + check_exact=False, + check_dtype=False, + rtol=0.1, + ) + + def test_pca_score(penguins_pca_model: decomposition.PCA): result = penguins_pca_model.score().to_pandas() expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_forecasting.py b/tests/system/small/ml/test_forecasting.py index 4726d5ab21..7fef189550 100644 --- a/tests/system/small/ml/test_forecasting.py +++ b/tests/system/small/ml/test_forecasting.py @@ -35,7 +35,9 @@ ] -def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPlus): +def test_arima_plus_predict_default( + time_series_arima_plus_model: forecasting.ARIMAPlus, +): utc = pytz.utc predictions = time_series_arima_plus_model.predict().to_pandas() assert predictions.shape == (3, 8) @@ -63,7 +65,7 @@ def test_model_predict_default(time_series_arima_plus_model: forecasting.ARIMAPl ) -def test_model_predict_params(time_series_arima_plus_model: 
forecasting.ARIMAPlus): +def test_arima_plus_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlus): utc = pytz.utc predictions = time_series_arima_plus_model.predict( horizon=4, confidence_level=0.9 @@ -94,7 +96,55 @@ def test_model_predict_params(time_series_arima_plus_model: forecasting.ARIMAPlu ) -def test_model_score( +def test_arima_plus_detect_anomalies( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): + anomalies = time_series_arima_plus_model.detect_anomalies( + new_time_series_df + ).to_pandas() + + expected = pd.DataFrame( + { + "is_anomaly": [False, False, False], + "lower_bound": [2349.301736, 2153.614829, 1849.040192], + "upper_bound": [3099.642833, 3033.12195, 2858.185876], + "anomaly_probability": [0.757824, 0.322559, 0.43011], + }, + ) + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]], + expected, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_arima_plus_detect_anomalies_params( + time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +): + anomalies = time_series_arima_plus_model.detect_anomalies( + new_time_series_df, anomaly_prob_threshold=0.7 + ).to_pandas() + + expected = pd.DataFrame( + { + "is_anomaly": [True, False, False], + "lower_bound": [2525.5363, 2360.1870, 2086.0609], + "upper_bound": [2923.408256, 2826.54981, 2621.165188], + "anomaly_probability": [0.757824, 0.322559, 0.43011], + }, + ) + pd.testing.assert_frame_equal( + anomalies[["is_anomaly", "lower_bound", "upper_bound", "anomaly_probability"]], + expected, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_arima_plus_score( time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df ): result = time_series_arima_plus_model.score( @@ -118,16 +168,14 @@ def test_model_score( ) -def test_model_summary( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df -): +def 
test_arima_plus_summary(time_series_arima_plus_model: forecasting.ARIMAPlus): result = time_series_arima_plus_model.summary() assert result.shape == (1, 12) assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) -def test_model_summary_show_all_candidates( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df +def test_arima_plus_summary_show_all_candidates( + time_series_arima_plus_model: forecasting.ARIMAPlus, ): result = time_series_arima_plus_model.summary( show_all_candidate_models=True, @@ -136,7 +184,7 @@ def test_model_summary_show_all_candidates( assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) -def test_model_score_series( +def test_arima_plus_score_series( time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df ): result = time_series_arima_plus_model.score( @@ -160,9 +208,7 @@ def test_model_score_series( ) -def test_model_summary_series( - time_series_arima_plus_model: forecasting.ARIMAPlus, new_time_series_df -): +def test_arima_plus_summary_series(time_series_arima_plus_model: forecasting.ARIMAPlus): result = time_series_arima_plus_model.summary() assert result.shape == (1, 12) assert all(column in result.columns for column in ARIMA_EVALUATE_OUTPUT_COL) From a9a5e6af52f41c4662fba16ca1875855cea67d3a Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Tue, 12 Mar 2024 16:18:26 -0700 Subject: [PATCH 20/21] chore: update genai notebooks to Gemini (#429) --- .../bq_dataframes_llm_code_generation.ipynb | 531 ++++++++++- .../bq_dataframes_llm_kmeans.ipynb | 10 +- ...q_dataframes_ml_drug_name_generation.ipynb | 881 +++++++++++++++--- .../generative_ai/large_language_models.ipynb | 132 ++- noxfile.py | 1 + 5 files changed, 1356 insertions(+), 199 deletions(-) diff --git a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb index b2966c404c..74a0d7b206 100644 
--- a/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "ur8xi4C7S06n" }, @@ -140,7 +140,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "2b4ef9b72d43" }, @@ -204,11 +204,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;31mERROR:\u001b[0m (gcloud.config.set) argument VALUE: Must be specified.\n", + "Usage: gcloud config set SECTION/PROPERTY VALUE [optional flags]\n", + " optional flags may be --help | --installation\n", + "\n", + "For detailed information on this command and its flags, run:\n", + " gcloud config set --help\n" + ] + } + ], "source": [ "PROJECT_ID = \"\" # @param {type:\"string\"}\n", "\n", @@ -229,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "eF-Twtc4XGem" }, @@ -273,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "254614fa0c46" }, @@ -295,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "603adbbf0532" }, @@ -316,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -338,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "NPPMuw2PXGeo" }, @@ -383,15 +396,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "sdjeXFwcHfl7" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job a3897125-4272-4817-a0e6-8e1a9e022b93 is DONE. 
0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "from bigframes.ml.llm import PaLM2TextGenerator\n", + "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "model = PaLM2TextGenerator()" + "model = GeminiTextGenerator()" ] }, { @@ -414,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "SchiTkQGIJog" }, @@ -435,22 +461,172 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "QCqgVCIsGGuv" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job e186a7bf-813c-4c46-80c8-ae079c829841 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 868ef0e0-ef33-4f0c-8b47-401a82bfc288 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
API
0values
1dtypes
\n", + "

2 rows × 1 columns

\n", + "
[2 rows x 1 columns in total]" + ], + "text/plain": [ + " API\n", + "0 values\n", + "1 dtypes\n", + "\n", + "[2 rows x 1 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_api.head(2)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "id": "BGJnZbgEGS5-" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job fcf9e1e9-cd3f-4a34-ba42-450c818bd6c7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c7b5f6a2-a5ca-4a4a-bcf3-9ddaa0a3777c is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
API
0shape
1size
\n", + "

2 rows × 1 columns

\n", + "
[2 rows x 1 columns in total]" + ], + "text/plain": [ + " API\n", + "0 shape\n", + "1 size\n", + "\n", + "[2 rows x 1 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "series_api.head(2)" ] @@ -481,11 +657,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "id": "EDAaIwHpQCDZ" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 67d4d2d9-dd57-4886-8bcb-68e9eb6e11e2 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 26124cec-8753-4b48-b467-5e17c2c3591e is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 9bdb0d90-60ec-4eec-96f4-990c3e1adef5 is DONE. 132 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "0 Generate Pandas sample code for DataFrame.values\n", + "1 Generate Pandas sample code for DataFrame.dtypes\n", + "Name: API, dtype: string" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_prompt_prefix = \"Generate Pandas sample code for DataFrame.\"\n", "series_prompt_prefix = \"Generate Pandas sample code for Series.\"\n", @@ -511,11 +736,84 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "id": "6i6HkFJZa8na" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job f0199b1e-5524-48ba-81ec-89d70c28b5d0 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job f1816195-25fa-4180-96ce-7917e9729428 is DONE. 584 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job ce1ad8d4-3fcd-4ca9-9f9b-4be0cfdabde5 is DONE. 146 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 3b245a41-a86e-4773-aa14-8edaa821c6b7 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job d3ceafe9-888d-4f5e-b7f3-c2218dae0736 is DONE. 904 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 930c2334-60ac-4ec1-8a06-2a4cf2d9dc1e is DONE. 226 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024)\n", "series_pred = model.predict(series_prompt.to_frame(), max_output_tokens=1024)" @@ -532,11 +830,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "id": "9A2gw6hP_2nX" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 10a766a8-7368-4a82-b239-764e1c13ed64 is DONE. 21.0 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "```python\n", + "import pandas as pd\n", + "\n", + "# Create a DataFrame\n", + "df = pd.DataFrame({\n", + " \"Name\": [\"John\", \"Mary\", \"Peter\"],\n", + " \"Age\": [20, 25, 30],\n", + " \"City\": [\"New York\", \"London\", \"Paris\"]\n", + "})\n", + "\n", + "# Get the values as a NumPy array\n", + "values = df.values\n", + "\n", + "# Print the values\n", + "print(values)\n", + "```\n" + ] + } + ], "source": [ "print(df_pred['ml_generate_text_llm_result'].iloc[0])" ] @@ -566,7 +899,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "id": "GskyyUQPowBT" }, @@ -595,11 +928,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "id": "PBlp-C-DOHRO" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloud Function Name projects/bigframes-dev/locations/us-central1/functions/bigframes-3a8781216c4ccdded9eecfdbd72c63f2\n", + "Remote Function Name bigframes-dev._76f0f906c2e04e83c3496619541347a5922c80ee.bigframes_3a8781216c4ccdded9eecfdbd72c63f2\n" + ] + } + ], "source": [ "CLOUD_FUNCTION_NAME = format(extract_code.bigframes_cloud_function)\n", "print(\"Cloud Function Name \" + CLOUD_FUNCTION_NAME)\n", @@ -618,7 +960,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": { "id": "bsQ9cmoWo0Ps" }, @@ -639,11 +981,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "id": "7yWzjhGy_zcy" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 96bea1ea-9c98-42e9-8f6d-a2b6cdeaf17a is DONE. 21.0 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import bigframes.pandas as bf\n", + "\n", + "# Create a DataFrame\n", + "df = pd.DataFrame({\n", + " \"Name\": [\"John\", \"Mary\", \"Peter\"],\n", + " \"Age\": [20, 25, 30],\n", + " \"City\": [\"New York\", \"London\", \"Paris\"]\n", + "})\n", + "\n", + "# Get the values as a NumPy array\n", + "values = df.values\n", + "\n", + "# Print the values\n", + "print(values)\n", + "\n" + ] + } + ], "source": [ "print(df_code['code'].iloc[0])" ] @@ -670,11 +1046,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "id": "-J5LHgS6LLZ0" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating gs://code-samples-d1d466b7-dfe6-11ee-b86e-4201c0a82d52/...\n" + ] + } + ], "source": [ "import uuid\n", "BUCKET_ID = \"code-samples-\" + str(uuid.uuid1())\n", @@ -693,11 +1077,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "id": "Zs_b5L-4IvER" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 72fe0ca6-2f37-457f-9705-ce89b2a4c324 is DONE. 21.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 1b21be65-8761-4694-932e-8fa634569e56 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 510566f8-05f2-4455-8daa-f24feea0344e is DONE. 27.0 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 82c0e7f6-ce99-462c-a7d3-e760391f6677 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "df_code[[\"code\"]].to_csv(f\"gs://{BUCKET_ID}/df_code*.csv\")\n", "series_code[[\"code\"]].to_csv(f\"gs://{BUCKET_ID}/series_code*.csv\")" @@ -716,11 +1149,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "id": "PspCXu-qu_ND" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "https://console.developers.google.com/storage/browser/code-samples-d1d466b7-dfe6-11ee-b86e-4201c0a82d52//n" + ] + } + ], "source": [ "print(f'https://console.developers.google.com/storage/browser/%7BBUCKET_ID%7D/')" ] @@ -754,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "yw7A461XLjvW" }, @@ -770,7 +1211,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "id": "sx_vKniMq9ZX" }, @@ -785,7 +1226,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "iQFo6OUBLmi3" }, @@ -805,6 +1246,18 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb index 14a681a693..221933c2f8 100644 --- a/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb +++ b/notebooks/generative_ai/bq_dataframes_llm_kmeans.ipynb @@ -61,7 +61,7 @@ "\n", "1. 
Use PaLM2TextEmbeddingGenerator to [generate text embeddings](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings) for each of 10000 complaints sent to an online bank. If you're not familiar with what a text embedding is, it's a list of numbers that are like coordinates in an imaginary \"meaning space\" for sentences. (It's like [word embeddings](https://en.wikipedia.org/wiki/Word_embedding), but for more general text.) The important point for our purposes is that similar sentences are close to each other in this imaginary space.\n", "2. Use KMeans clustering to group together complaints whose text embeddings are near to eachother. This will give us sets of similar complaints, but we don't yet know _why_ these complaints are similar.\n", - "3. Prompt PaLM2TextGenerator in English asking what the difference is between the groups of complaints that we got. Thanks to the power of modern LLMs, the response might give us a very good idea of what these complaints are all about, but remember to [\"understand the limits of your dataset and model.\"](https://ai.google/responsibility/responsible-ai-practices/#:~:text=Understand%20the%20limitations%20of%20your%20dataset%20and%20model)\n", + "3. Prompt GeminiTextGenerator in English asking what the difference is between the groups of complaints that we got. Thanks to the power of modern LLMs, the response might give us a very good idea of what these complaints are all about, but remember to [\"understand the limits of your dataset and model.\"](https://ai.google/responsibility/responsible-ai-practices/#:~:text=Understand%20the%20limitations%20of%20your%20dataset%20and%20model)\n", "\n", "We will tie these pieces together in Python using BigQuery DataFrames. [Click here](https://cloud.google.com/bigquery/docs/dataframes-quickstart) to learn more about BigQuery DataFrames!" 
] @@ -894,7 +894,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Build prompts - we will choose just two of our categories and prompt PaLM2TextGenerator to identify their salient characteristics. The prompt is natural language in a python string." + "Build prompts - we will choose just two of our categories and prompt GeminiTextGenerator to identify their salient characteristics. The prompt is natural language in a python string." ] }, { @@ -1121,9 +1121,9 @@ } ], "source": [ - "from bigframes.ml.llm import PaLM2TextGenerator\n", + "from bigframes.ml.llm import GeminiTextGenerator\n", "\n", - "q_a_model = PaLM2TextGenerator()" + "q_a_model = GeminiTextGenerator()" ] }, { @@ -1216,7 +1216,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We now see PaLM2TextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!" + "We now see GeminiTextGenerator's characterization of the different comment groups. Thanks for using BigQuery DataFrames!" ] }, { diff --git a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb index 52a1c4e768..8c0b1b0038 100644 --- a/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb +++ b/notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "ur8xi4C7S06n" }, @@ -92,7 +92,7 @@ "1. Use `bigframes` to query the FDA dataset of over 100,000 drugs, filtered on the brand name, generic name, and indications & usage columns.\n", "1. Filter this dataset to find prototypical brand names that can be used as examples in prompt tuning.\n", "1. Create a prompt with the user input, general instructions, examples and counter-examples for the desired brand name.\n", - "1. Use the `bigframes.ml.llm.PaLM2TextGenerator` to generate choices of brand names." + "1. 
Use the `bigframes.ml.llm.GeminiTextGenerator` to generate choices of brand names." ] }, { @@ -138,13 +138,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "2b4ef9b72d43" }, "outputs": [], "source": [ - "!pip install -U --quiet bigframes" + "# !pip install -U --quiet bigframes" ] }, { @@ -158,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "f200f10a1da3" }, @@ -182,15 +182,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "PyQmSRbKA8r-" }, "outputs": [], "source": [ "import bigframes.pandas as bpd\n", - "from google.cloud import bigquery_connection_v1 as bq_connection\n", - "from bigframes.ml.llm import PaLM2TextGenerator\n", + "from bigframes.ml.llm import GeminiTextGenerator\n", "from IPython.display import Markdown" ] }, @@ -226,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "254614fa0c46" }, @@ -246,7 +245,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "603adbbf0532" }, @@ -294,13 +293,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1;31mERROR:\u001b[0m (gcloud.config.set) argument VALUE: Must be specified.\n", + "Usage: gcloud config set SECTION/PROPERTY VALUE [optional flags]\n", + " optional flags may be --help | --installation\n", + "\n", + "For detailed information on this command and its flags, run:\n", + " gcloud config set --help\n" + ] + } + ], "source": [ - "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "# Please fill in these values.\n", + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", "\n", "# Set the project id\n", "! 
gcloud config set project {PROJECT_ID}" @@ -320,17 +333,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "id": "G1vVsPiMsL2X" }, "outputs": [], "source": [ "# Please fill in these values.\n", - "LOCATION = \"us\" # @param {type:\"string\"}\n", - "CONNECTION = \"\" # @param {type:\"string\"}\n", - "\n", - "connection_name = f\"{PROJECT_ID}.{LOCATION}.{CONNECTION}\"" + "LOCATION = \"us\" # @param {type:\"string\"}" ] }, { @@ -342,50 +352,6 @@ "We will now try to use the provided connection, and if it doesn't exist, create a new one. We will also print the service account used." ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "56Hw42m6kFrj" - }, - "outputs": [], - "source": [ - "# Initialize client and set request parameters\n", - "client = bq_connection.ConnectionServiceClient()\n", - "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{LOCATION}\"\n", - "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}\"\n", - "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", - "\n", - "# Try to connect using provided connection\n", - "try:\n", - " request = client.get_connection(\n", - " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", - " )\n", - " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", - "# Create a new connection on error\n", - "except Exception:\n", - " connection = bq_connection.types.Connection(\n", - " {\"friendly_name\": CONNECTION, \"cloud_resource\": cloud_resource_properties}\n", - " )\n", - " request = bq_connection.CreateConnectionRequest(\n", - " {\n", - " \"parent\": new_conn_parent,\n", - " \"connection_id\": CONNECTION,\n", - " \"connection\": connection,\n", - " }\n", - " )\n", - " response = client.create_connection(request)\n", - " CONN_SERVICE_ACCOUNT = (\n", - " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", - " 
)\n", - "# Set service account permissions\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", - "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'\n", - "\n", - "print(CONN_SERVICE_ACCOUNT)" - ] - }, { "cell_type": "markdown", "metadata": { @@ -399,7 +365,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "id": "OCccLirpkSRz" }, @@ -422,7 +388,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "id": "oxphj2gnuKou" }, @@ -445,11 +411,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "id": "0knz5ZWMzed-" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Provide 10 unique and modern brand names in Markdown bullet point format. Do not provide any additional explanation.\n", + "\n", + "Be creative with the brand names. Don't use English words directly; use variants or invented words.\n", + "\n", + "The generic name is: Entropofloxacin\n", + "\n", + "The indications and usage are: Entropofloxacin is a fluoroquinolone antibiotic that is used to treat a variety of bacterial infections, including: pneumonia, streptococcus infections, salmonella infections, escherichia coli infections, and pseudomonas aeruginosa infections It is taken by mouth or by injection. The dosage and frequency of administration will vary depending on the type of infection being treated. It should be taken for the full course of treatment, even if symptoms improve after a few days. 
Stopping the medication early may increase the risk of the infection coming back..\n" + ] + } + ], "source": [ "zero_shot_prompt = f\"\"\"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format. Do not provide any additional explanation.\n", "\n", @@ -464,19 +444,15 @@ }, { "cell_type": "markdown", - "metadata": { - "id": "LCRE2L720f5y" - }, + "metadata": {}, "source": [ - "Next, let's create a helper function to predict with our model. It will take a string input, and add it to a temporary BigFrames `DataFrame`. It will also return the string extracted from the response `DataFrame`." + "Next, let's create a helper function to predict with our model. It will take a string input, and add it to a temporary BigFrames DataFrame. It will also return the string extracted from the response DataFrame." ] }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "LB3xgDroIxlx" - }, + "execution_count": 12, + "metadata": {}, "outputs": [], "source": [ "def predict(prompt: str, temperature: float = TEMPERATURE) -> str:\n", @@ -488,7 +464,7 @@ " )\n", "\n", " # Return response\n", - " return model.predict(input, temperature).ml_generate_text_llm_result.iloc[0]" + " return model.predict(input, temperature=temperature).ml_generate_text_llm_result.iloc[0]" ] }, { @@ -502,20 +478,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": { "id": "UW2fQ2k5Hsic" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 25b47284-2b28-4cd9-ac9a-90379f818c84 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0efa6f42-6569-4274-ac21-667c7eecefc7 is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job c5e98170-7d58-4aa2-a3a3-6680cd9a54c0 is DONE. 8 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 5fd9d5bf-c731-4b21-b7c9-9b6244ffb412 is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 36f7e8ec-ee42-4f94-8e38-bdf18b371517 is DONE. 118 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "- Etherealox\n", + "- Zenithrox\n", + "- Aureox\n", + "- Lucentrox\n", + "- Aethrox\n", + "- Luminex\n", + "- Elysirox\n", + "- Quasarox\n", + "- Novaflux\n", + "- Arcanox" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Get BigFrames session\n", - "session = bpd.get_global_session()\n", - "\n", "# Define the model\n", - "model = PaLM2TextGenerator(session=session, connection_name=connection_name)\n", + "model = GeminiTextGenerator()\n", "\n", "# Invoke LLM with prompt\n", - "response = predict(zero_shot_prompt)\n", + "response = predict(zero_shot_prompt, temperature = TEMPERATURE)\n", "\n", "# Print results as Markdown\n", "Markdown(response)" @@ -552,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": { "id": "MXdI78SOElyt" }, @@ -574,11 +630,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "id": "aQ2iscnhF2cx" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Provide 10 unique and modern brand names in Markdown bullet point format, related to the drug at 
the bottom of this prompt.\n", + "\n", + "Be creative with the brand names. Don't use English words directly; use variants or invented words.\n", + "\n", + "First, we will provide 3 examples to help with your thought process.\n", + "\n", + "Then, we will provide the generic name and usage for the drug we'd like you to generate brand names for.\n", + "\n" + ] + } + ], "source": [ "prefix_prompt = f\"\"\"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format, related to the drug at the bottom of this prompt.\n", "\n", @@ -605,11 +676,139 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "IoO_Bp8wA07N" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 542b0ce1-9d56-456f-bcd3-d24a6f0c825a is DONE. 84.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 2405ba41-b263-46d3-a0e5-3b5e7ecef6ab is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job b24663ec-8d81-4295-84df-ffb65a6a0f1b is DONE. 3.1 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openfda_generic_nameopenfda_brand_nameindications_and_usage
0BENZALKONIUM CHLORIDEmeijer kidsUse - hand washing to decrease bacteria on skin
3OCTINOXATE, TITANIUM DIOXIDECD DIORSKIN STAR Studio Makeup Spectacular Bri...Uses Helps prevent sunburn. If used as directe...
4TRIAMCINOLONE ACETONIDETriamcinolone AcetonideINDICATIONS AND USAGE Triamcinolone Acetonide ...
5BACITRACIN ZINC, NEOMYCIN SULFATE, POLYMYXIN B...Triple AntibioticFirst aid to help prevent infection in minor c...
6RISPERIDONERisperidone1. INDICATIONS AND USAGE Risperidone is an aty...
\n", + "

5 rows × 3 columns

\n", + "
[5 rows x 3 columns in total]" + ], + "text/plain": [ + " openfda_generic_name \\\n", + "0 BENZALKONIUM CHLORIDE \n", + "3 OCTINOXATE, TITANIUM DIOXIDE \n", + "4 TRIAMCINOLONE ACETONIDE \n", + "5 BACITRACIN ZINC, NEOMYCIN SULFATE, POLYMYXIN B... \n", + "6 RISPERIDONE \n", + "\n", + " openfda_brand_name \\\n", + "0 meijer kids \n", + "3 CD DIORSKIN STAR Studio Makeup Spectacular Bri... \n", + "4 Triamcinolone Acetonide \n", + "5 Triple Antibiotic \n", + "6 Risperidone \n", + "\n", + " indications_and_usage \n", + "0 Use - hand washing to decrease bacteria on skin \n", + "3 Uses Helps prevent sunburn. If used as directe... \n", + "4 INDICATIONS AND USAGE Triamcinolone Acetonide ... \n", + "5 First aid to help prevent infection in minor c... \n", + "6 1. INDICATIONS AND USAGE Risperidone is an aty... \n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Query 3 columns of interest from drug label dataset\n", "df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", @@ -636,7 +835,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": { "id": "95WDe2eCCeLx" }, @@ -663,11 +862,89 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { "id": "2ohZYg7QEyJV" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 293c90e0-7fdf-4769-9d8e-f222f35d368e is DONE. 84.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openfda_generic_nameopenfda_brand_nameindications_and_usage
81748AMPICILLIN SODIUMAmpicillinINDICATIONS AND USAGE Ampicillin for Injection...
730AZTREONAMCayston1 INDICATIONS AND USAGE CAYSTON® is indicated ...
71763TERAZOSIN HYDROCHLORIDETerazosinINDICATIONS AND USAGE Terazosin capsules are i...
\n", + "
" + ], + "text/plain": [ + " openfda_generic_name openfda_brand_name \\\n", + "81748 AMPICILLIN SODIUM Ampicillin \n", + "730 AZTREONAM Cayston \n", + "71763 TERAZOSIN HYDROCHLORIDE Terazosin \n", + "\n", + " indications_and_usage \n", + "81748 INDICATIONS AND USAGE Ampicillin for Injection... \n", + "730 1 INDICATIONS AND USAGE CAYSTON® is indicated ... \n", + "71763 INDICATIONS AND USAGE Terazosin capsules are i... " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a sample and convert to a Pandas dataframe for local usage.\n", "df_examples = df.sample(NUM_EXAMPLES, random_state=3).to_pandas()\n", @@ -686,11 +963,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": { "id": "PcJdSaw0EGcW" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'brand_name': 'Ampicillin', 'generic_name': 'AMPICILLIN SODIUM', 'usage': 'INDICATIONS AND USAGE Ampicillin for Injection, USP is indicated in the treatment of infections caused by susceptible strains of the designated organisms in the following conditions: Respiratory Tract Infections caused by Streptococcus pneumoniae. Staphylococcus aureus (penicillinase and nonpenicillinase-producing), H. influenzae, and Group A beta-hemolytic streptococci. Bacterial Meningitis caused by E. coli, Group B streptococci, and other Gram-negative bacteria (Listeria monocytogenes, N. meningitidis). The addition of an aminoglycoside with ampicillin may increase its effectiveness against Gram-negative bacteria. Septicemia and Endocarditis caused by susceptible Gram-positive organisms including Streptococcus spp., penicillin G-susceptible staphylococci, and enterococci. Gram-negative sepsis caused by E. coli, Proteus mirabilis and Salmonella spp. responds to ampicillin. Endocarditis due to enterococcal strains usually respond to intravenous therapy. 
The addition of an aminoglycoside may enhance the effectiveness of ampicillin when treating streptococcal endocarditis. Urinary Tract Infections caused by sensitive strains of E. coli and Proteus mirabilis. Gastrointestinal Infections caused by Salmonella typhi (typhoid fever), other Salmonella spp., and Shigella spp. (dysentery) usually respond to oral or intravenous therapy. Bacteriology studies to determine the causative organisms and their susceptibility to ampicillin should be performed. Therapy may be instituted prior to obtaining results of susceptibility testing. It is advisable to reserve the parenteral form of this drug for moderately severe and severe infections and for patients who are unable to take the oral forms. A change to oral ampicillin may be made as soon as appropriate. To reduce the development of drug-resistant bacteria and maintain the effectiveness of Ampicillin for Injection, USP and other antibacterial drugs, Ampicillin for Injection, USP should be used only to treat or prevent infections that are proven or strongly suspected to be caused by susceptible bacteria. When culture and susceptibility information are available, they should be considered in selecting or modifying antibacterial therapy. In the absence of such data, local epidemiology and susceptibility patterns may contribute to the empiric selection of therapy. Indicated surgical procedures should be performed.'}, {'brand_name': 'Cayston', 'generic_name': 'AZTREONAM', 'usage': '1 INDICATIONS AND USAGE CAYSTON® is indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia [see Clinical Studies (14) ]. 
To reduce the development of drug-resistant bacteria and maintain the effectiveness of CAYSTON and other antibacterial drugs, CAYSTON should be used only to treat patients with CF known to have Pseudomonas aeruginosa in the lungs. CAYSTON is a monobactam antibacterial indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia. (1)'}, {'brand_name': 'Terazosin', 'generic_name': 'TERAZOSIN HYDROCHLORIDE', 'usage': 'INDICATIONS AND USAGE Terazosin capsules are indicated for the treatment of symptomatic benign prostatic hyperplasia (BPH). There is a rapid response, with approximately 70% of patients experiencing an increase in urinary flow and improvement in symptoms of BPH when treated with terazosin capsules. The long-term effects of terazosin capsules on the incidence of surgery, acute urinary obstruction or other complications of BPH are yet to be determined. Terazosin capsules are also indicated for the treatment of hypertension. Terazosin capsules can be used alone or in combination with other antihypertensive agents such as diuretics or beta-adrenergic blocking agents.'}]\n" + ] + } + ], "source": [ "examples = [\n", " {\n", @@ -719,11 +1004,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "id": "kzAVsF6wJ93S" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Generic name: AMPICILLIN SODIUM\\nUsage: INDICATIONS AND USAGE Ampicillin for Injection, USP is indicated in the treatment of infections caused by susceptible strains of the designated organisms in the following conditions: Respiratory Tract Infections caused by Streptococcus pneumoniae. Staphylococcus aureus (penicillinase and nonpenicillinase-producing), H. 
influenzae, and Group A beta-hemolytic streptococci. Bacterial Meningitis caused by E. coli, Group B streptococci, and other Gram-negative bacteria (Listeria monocytogenes, N. meningitidis). The addition of an aminoglycoside with ampicillin may increase its effectiveness against Gram-negative bacteria. Septicemia and Endocarditis caused by susceptible Gram-positive organisms including Streptococcus spp., penicillin G-susceptible staphylococci, and enterococci. Gram-negative sepsis caused by E. coli, Proteus mirabilis and Salmonella spp. responds to ampicillin. Endocarditis due to enterococcal strains usually respond to intravenous therapy. The addition of an aminoglycoside may enhance the effectiveness of ampicillin when treating streptococcal endocarditis. Urinary Tract Infections caused by sensitive strains of E. coli and Proteus mirabilis. Gastrointestinal Infections caused by Salmonella typhi (typhoid fever), other Salmonella spp., and Shigella spp. (dysentery) usually respond to oral or intravenous therapy. Bacteriology studies to determine the causative organisms and their susceptibility to ampicillin should be performed. Therapy may be instituted prior to obtaining results of susceptibility testing. It is advisable to reserve the parenteral form of this drug for moderately severe and severe infections and for patients who are unable to take the oral forms. A change to oral ampicillin may be made as soon as appropriate. To reduce the development of drug-resistant bacteria and maintain the effectiveness of Ampicillin for Injection, USP and other antibacterial drugs, Ampicillin for Injection, USP should be used only to treat or prevent infections that are proven or strongly suspected to be caused by susceptible bacteria. When culture and susceptibility information are available, they should be considered in selecting or modifying antibacterial therapy. 
In the absence of such data, local epidemiology and susceptibility patterns may contribute to the empiric selection of therapy. Indicated surgical procedures should be performed.\\nBrand name: Ampicillin\\n\\nGeneric name: AZTREONAM\\nUsage: 1 INDICATIONS AND USAGE CAYSTON® is indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia [see Clinical Studies (14) ]. To reduce the development of drug-resistant bacteria and maintain the effectiveness of CAYSTON and other antibacterial drugs, CAYSTON should be used only to treat patients with CF known to have Pseudomonas aeruginosa in the lungs. CAYSTON is a monobactam antibacterial indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia. (1)\\nBrand name: Cayston\\n\\nGeneric name: TERAZOSIN HYDROCHLORIDE\\nUsage: INDICATIONS AND USAGE Terazosin capsules are indicated for the treatment of symptomatic benign prostatic hyperplasia (BPH). There is a rapid response, with approximately 70% of patients experiencing an increase in urinary flow and improvement in symptoms of BPH when treated with terazosin capsules. The long-term effects of terazosin capsules on the incidence of surgery, acute urinary obstruction or other complications of BPH are yet to be determined. Terazosin capsules are also indicated for the treatment of hypertension. 
Terazosin capsules can be used alone or in combination with other antihypertensive agents such as diuretics or beta-adrenergic blocking agents.\\nBrand name: Terazosin\\n\\n'" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "example_prompt = \"\"\n", "for example in examples:\n", @@ -743,11 +1039,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "id": "OYp6W_XfHTlo" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generic name: Entropofloxacin\n", + "Usage: Entropofloxacin is a fluoroquinolone antibiotic that is used to treat a variety of bacterial infections, including: pneumonia, streptococcus infections, salmonella infections, escherichia coli infections, and pseudomonas aeruginosa infections It is taken by mouth or by injection. The dosage and frequency of administration will vary depending on the type of infection being treated. It should be taken for the full course of treatment, even if symptoms improve after a few days. Stopping the medication early may increase the risk of the infection coming back.\n", + "Brand names:\n" + ] + } + ], "source": [ "suffix_prompt = f\"\"\"Generic name: {GENERIC_NAME}\n", "Usage: {USAGE}\n", @@ -767,11 +1073,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "id": "99xdU7l8C1h8" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Provide 10 unique and modern brand names in Markdown bullet point format, related to the drug at the bottom of this prompt.\n", + "\n", + "Be creative with the brand names. 
Don't use English words directly; use variants or invented words.\n", + "\n", + "First, we will provide 3 examples to help with your thought process.\n", + "\n", + "Then, we will provide the generic name and usage for the drug we'd like you to generate brand names for.\n", + "Generic name: AMPICILLIN SODIUM\n", + "Usage: INDICATIONS AND USAGE Ampicillin for Injection, USP is indicated in the treatment of infections caused by susceptible strains of the designated organisms in the following conditions: Respiratory Tract Infections caused by Streptococcus pneumoniae. Staphylococcus aureus (penicillinase and nonpenicillinase-producing), H. influenzae, and Group A beta-hemolytic streptococci. Bacterial Meningitis caused by E. coli, Group B streptococci, and other Gram-negative bacteria (Listeria monocytogenes, N. meningitidis). The addition of an aminoglycoside with ampicillin may increase its effectiveness against Gram-negative bacteria. Septicemia and Endocarditis caused by susceptible Gram-positive organisms including Streptococcus spp., penicillin G-susceptible staphylococci, and enterococci. Gram-negative sepsis caused by E. coli, Proteus mirabilis and Salmonella spp. responds to ampicillin. Endocarditis due to enterococcal strains usually respond to intravenous therapy. The addition of an aminoglycoside may enhance the effectiveness of ampicillin when treating streptococcal endocarditis. Urinary Tract Infections caused by sensitive strains of E. coli and Proteus mirabilis. Gastrointestinal Infections caused by Salmonella typhi (typhoid fever), other Salmonella spp., and Shigella spp. (dysentery) usually respond to oral or intravenous therapy. Bacteriology studies to determine the causative organisms and their susceptibility to ampicillin should be performed. Therapy may be instituted prior to obtaining results of susceptibility testing. 
It is advisable to reserve the parenteral form of this drug for moderately severe and severe infections and for patients who are unable to take the oral forms. A change to oral ampicillin may be made as soon as appropriate. To reduce the development of drug-resistant bacteria and maintain the effectiveness of Ampicillin for Injection, USP and other antibacterial drugs, Ampicillin for Injection, USP should be used only to treat or prevent infections that are proven or strongly suspected to be caused by susceptible bacteria. When culture and susceptibility information are available, they should be considered in selecting or modifying antibacterial therapy. In the absence of such data, local epidemiology and susceptibility patterns may contribute to the empiric selection of therapy. Indicated surgical procedures should be performed.\n", + "Brand name: Ampicillin\n", + "\n", + "Generic name: AZTREONAM\n", + "Usage: 1 INDICATIONS AND USAGE CAYSTON® is indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia [see Clinical Studies (14) ]. To reduce the development of drug-resistant bacteria and maintain the effectiveness of CAYSTON and other antibacterial drugs, CAYSTON should be used only to treat patients with CF known to have Pseudomonas aeruginosa in the lungs. CAYSTON is a monobactam antibacterial indicated to improve respiratory symptoms in cystic fibrosis (CF) patients with Pseudomonas aeruginosa. Safety and effectiveness have not been established in pediatric patients below the age of 7 years, patients with FEV1 <25% or >75% predicted, or patients colonized with Burkholderia cepacia. 
(1)\n", + "Brand name: Cayston\n", + "\n", + "Generic name: TERAZOSIN HYDROCHLORIDE\n", + "Usage: INDICATIONS AND USAGE Terazosin capsules are indicated for the treatment of symptomatic benign prostatic hyperplasia (BPH). There is a rapid response, with approximately 70% of patients experiencing an increase in urinary flow and improvement in symptoms of BPH when treated with terazosin capsules. The long-term effects of terazosin capsules on the incidence of surgery, acute urinary obstruction or other complications of BPH are yet to be determined. Terazosin capsules are also indicated for the treatment of hypertension. Terazosin capsules can be used alone or in combination with other antihypertensive agents such as diuretics or beta-adrenergic blocking agents.\n", + "Brand name: Terazosin\n", + "\n", + "Generic name: Entropofloxacin\n", + "Usage: Entropofloxacin is a fluoroquinolone antibiotic that is used to treat a variety of bacterial infections, including: pneumonia, streptococcus infections, salmonella infections, escherichia coli infections, and pseudomonas aeruginosa infections It is taken by mouth or by injection. The dosage and frequency of administration will vary depending on the type of infection being treated. It should be taken for the full course of treatment, even if symptoms improve after a few days. Stopping the medication early may increase the risk of the infection coming back.\n", + "Brand names:\n" + ] + } + ], "source": [ "# Define the prompt\n", "few_shot_prompt = prefix_prompt + example_prompt + suffix_prompt\n", @@ -791,11 +1126,82 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 42, "metadata": { "id": "d4ODRJdvLhlQ" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 5c6c3b79-812c-4a6e-876e-ca1ff6230a6e is DONE. 0 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 168d5859-5edb-4702-8192-838ac2c7bc17 is DONE. 8 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 72f07348-4bcd-4042-84ca-396e7651ad03 is DONE. 2 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 70863a3b-8c63-423c-84cd-2804139daf5f is DONE. 679 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "- **Aerion:** (Derived from \"aer\" meaning air)\n", + "- **Aquazone:** (Combining \"aqua\" for water and \"zone\" for area)\n", + "- **Biosphere:** (Inspired by the concept of a self-contained ecosystem)\n", + "- **Celestial:** (Evoking the vastness and healing power of the universe)\n", + "- **Ethereal:** (Conveying a sense of lightness and transcendence)\n", + "- **Luminary:** (From \"lumen\" meaning light, symbolizing hope and healing)\n", + "- **Quasar:** (Inspired by the powerful and distant cosmic objects)\n", + "- **Sanctuary:** (Creating a sense of safety and refuge)\n", + "- **Zenith:** (Reaching the highest point or peak)\n", + "- **Zephyr:** (Named after the gentle west wind, representing a calming and soothing effect)" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "response = predict(few_shot_prompt)\n", "\n", @@ -817,11 +1223,139 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": { "id": "8eAutS41mx6U" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job b73f92bb-0e58-4fe4-adfb-b948fc5f4647 is DONE. 84.4 MB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 392dae36-aacb-4753-b28c-dad8291cb153 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 7c6ff6ee-db64-4629-a417-846dcecac127 is DONE. 6.3 kB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
openfda_generic_nameopenfda_brand_nameindications_and_usage
89MEPHITIS MEPHITICAMEPHITIS MEPHITICAINDICATIONS Condition listed above or as direc...
105ONDANSETRONONDANSETRON1 INDICATIONS AND USAGE Ondansetron Injection,...
124CLOFARABINECLOFARABINE1 INDICATIONS AND USAGE Clofarabine injection ...
273ACETAMINOPHEN AND DIPHENHYDRAMINE HYDROCHLORIDEACETAMINOPHEN AND DIPHENHYDRAMINE HYDROCHLORIDEUses Temporary relief of occasional headaches ...
284OFLOXACINOFLOXACININDICATIONS AND USAGE To reduce the developmen...
\n", + "

5 rows × 3 columns

\n", + "
[5 rows x 3 columns in total]" + ], + "text/plain": [ + " openfda_generic_name \\\n", + "89 MEPHITIS MEPHITICA \n", + "105 ONDANSETRON \n", + "124 CLOFARABINE \n", + "273 ACETAMINOPHEN AND DIPHENHYDRAMINE HYDROCHLORIDE \n", + "284 OFLOXACIN \n", + "\n", + " openfda_brand_name \\\n", + "89 MEPHITIS MEPHITICA \n", + "105 ONDANSETRON \n", + "124 CLOFARABINE \n", + "273 ACETAMINOPHEN AND DIPHENHYDRAMINE HYDROCHLORIDE \n", + "284 OFLOXACIN \n", + "\n", + " indications_and_usage \n", + "89 INDICATIONS Condition listed above or as direc... \n", + "105 1 INDICATIONS AND USAGE Ondansetron Injection,... \n", + "124 1 INDICATIONS AND USAGE Clofarabine injection ... \n", + "273 Uses Temporary relief of occasional headaches ... \n", + "284 INDICATIONS AND USAGE To reduce the developmen... \n", + "\n", + "[5 rows x 3 columns]" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Query 3 columns of interest from drug label dataset\n", "df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", @@ -851,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": { "id": "19TvGN1PVmVX" }, @@ -878,16 +1412,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": { "id": "tiSHa5B4aFhw" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job d216bea6-9b9c-4918-9194-40de2745beca is DONE. 84.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 37d88636-b1fb-44da-9504-44144af9624d is DONE. 800 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 0b35db83-5bac-47b4-8a2c-b46a816c0e3e is DONE. 200 Bytes processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "def batch_predict(\n", " input: bpd.DataFrame, temperature: float = TEMPERATURE\n", ") -> bpd.DataFrame:\n", - " return model.predict(input, temperature).ml_generate_text_llm_result\n", + " return model.predict(input, temperature=temperature).ml_generate_text_llm_result\n", "\n", "\n", "response = batch_predict(df_missing[\"prompt\"])" @@ -904,19 +1475,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": { "id": "TnizdeqBdbZj" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 4397b5f3-5058-409c-a361-c9fa715e46ee is DONE. 84.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 147ea301-e249-49fb-8280-d61948d5df7f is DONE. 84.4 MB processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 067a2a73-0f36-42a6-973e-074ab8be631a is DONE. 56.7 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Generic name: MEPHITIS MEPHITICA\n", + "Brand name: INDICATIONS Condition listed above or as directed by the physician\n", + "Response: **Ephemeral** (Latin root: \"ephemerus,\" meaning \"lasting for a day\")\n", + "\n", + "**Aetheria** (Greek root: \"aither,\" meaning \"upper air, sky\")\n", + "\n", + "**Zenithar** (Combination of \"zenith\" and \"pharma\")\n", + "\n", + "**Celestian** (Latin root: \"celestial,\" meaning \"heavenly\")\n", + "\n", + "**Astralux** (Combination of \"astral\" and \"lux,\" meaning \"light\")\n" + ] + } + ], "source": [ "# Pick a sample\n", "k = 0\n", "\n", "# Gather the prompt and response details\n", - "prompt_generic = df_missing[\"openfda_generic_name\"][k].iloc[0]\n", - "prompt_usage = df_missing[\"indications_and_usage\"][k].iloc[0]\n", - "response_str = response[k].iloc[0]\n", + "prompt_generic = df_missing[\"openfda_generic_name\"].iloc[k]\n", + "prompt_usage = df_missing[\"indications_and_usage\"].iloc[k]\n", + "response_str = response.iloc[k]\n", "\n", "# Print details\n", "print(f\"Generic name: {prompt_generic}\")\n", @@ -934,36 +1559,6 @@ "\n", "You've also seen how BigFrames can manage each step of the process, including gathering data, data manipulation, and querying the LLM." 
] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Bys6--dVmq7R" - }, - "source": [ - "## Cleaning up\n", - "\n", - "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", - "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", - "\n", - "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cIODjOLump_-" - }, - "outputs": [], - "source": [ - "# Delete the BigQuery Connection\n", - "from google.cloud import bigquery_connection_v1 as bq_connection\n", - "client = bq_connection.ConnectionServiceClient()\n", - "CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}\"\n", - "client.delete_connection(name=CONNECTION_ID)\n", - "print(f\"Deleted connection {CONNECTION_ID}.\")" - ] } ], "metadata": { @@ -973,6 +1568,10 @@ "kernelspec": { "display_name": "Python 3", "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.9" } }, "nbformat": 4, diff --git a/notebooks/generative_ai/large_language_models.ipynb b/notebooks/generative_ai/large_language_models.ipynb index 2695ee9dc0..08ef52b544 100644 --- a/notebooks/generative_ai/large_language_models.ipynb +++ b/notebooks/generative_ai/large_language_models.ipynb @@ -2,13 +2,13 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import bigframes.pandas\n", "import pandas as pd\n", - "from bigframes.ml.llm import PaLM2TextGenerator" + "from bigframes.ml.llm import GeminiTextGenerator" ] }, { @@ -22,9 +22,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/usr/local/google/home/garrettwu/src/bigframes/bigframes/session/__init__.py:1762: UserWarning: No explicit location is set, so using location US for the session.\n", + " return Session(context)\n" + ] + } + ], "source": [ "session = bigframes.pandas.get_global_session()\n", "connection = f\"{session.bqclient.project}.us.bigframes-default-connection\"" @@ -42,9 +51,22 @@ "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "Query job 12bcd690-ca99-4001-bf26-032f50e77d62 is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "model = PaLM2TextGenerator(session=session, connection_name=connection)" + "model = GeminiTextGenerator(session=session, connection_name=connection)" ] }, { @@ -83,6 +105,54 @@ "execution_count": 5, "metadata": {}, "outputs": [ + { + "data": { + "text/html": [ + "Query job f8fe31c6-7d8a-4919-9492-8304a0083cca is DONE. 0 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 28bab71f-e218-4d92-9a50-dab41bb0c71f is DONE. 24 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job 01d66b61-459f-474e-9f66-d519f9c2f23d is DONE. 6 Bytes processed. Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "Query job af606ca7-4bcf-4bd1-95fd-c516542b5a4f is DONE. 5.3 kB processed. 
Open Job" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ @@ -105,30 +175,64 @@ " \n", " \n", " ml_generate_text_llm_result\n", + " ml_generate_text_rai_result\n", + " ml_generate_text_status\n", + " prompt\n", " \n", " \n", " \n", " \n", " 0\n", - " BigQuery is a fully managed, petabyte-scale an...\n", + " **BigQuery**\n", + "\n", + "**Definition:**\n", + "\n", + "BigQuery is a s...\n", + " null\n", + " \n", + " What is BigQuery?\n", " \n", " \n", " 1\n", - " BQML stands for BigQuery Machine Learning. It ...\n", + " **BigQuery Machine Learning (BQML)**\n", + "\n", + "BQML is ...\n", + " null\n", + " \n", + " What is BQML?\n", " \n", " \n", " 2\n", - " A BigQuery DataFrames is a distributed collecti...\n", + " BigQuery DataFrame is a Python DataFrame imple...\n", + " null\n", + " \n", + " What is BigQuery DataFrame?\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ml_generate_text_llm_result\n", - "0 BigQuery is a fully managed, petabyte-scale an...\n", - "1 BQML stands for BigQuery Machine Learning. It ...\n", - "2 A BigQuery DataFrames is a distributed collecti..." + " ml_generate_text_llm_result \\\n", + "0 **BigQuery**\n", + "\n", + "**Definition:**\n", + "\n", + "BigQuery is a s... \n", + "1 **BigQuery Machine Learning (BQML)**\n", + "\n", + "BQML is ... \n", + "2 BigQuery DataFrame is a Python DataFrame imple... \n", + "\n", + " ml_generate_text_rai_result ml_generate_text_status \\\n", + "0 null \n", + "1 null \n", + "2 null \n", + "\n", + " prompt \n", + "0 What is BigQuery? \n", + "1 What is BQML? \n", + "2 What is BigQuery DataFrame? " ] }, "execution_count": 5, @@ -157,7 +261,7 @@ { "data": { "text/plain": [ - "'BigQuery is a fully managed, petabyte-scale analytics data warehouse that enables businesses to analyze all their data very quickly. It is a cloud-based service that offers a pay-as-you-go pricing model. 
BigQuery is designed to handle large amounts of data and provide fast performance. It is a good choice for businesses that need to analyze large amounts of data quickly and easily.'" + "'**BigQuery**\\n\\n**Definition:**\\n\\nBigQuery is a serverless, highly scalable, cloud-based data warehouse and analytics platform offered by Google Cloud.\\n\\n**Key Features:**\\n\\n* **Massive Scalability:** Can handle large datasets (petabytes or more) with fast query execution.\\n* **Elastic:** Automatically scales compute resources based on workload requirements.\\n* **Serverless:** Users do not need to manage infrastructure or provision resources.\\n* **Flexible Data Loading:** Supports a wide range of data sources, including files, databases, and streaming data.\\n* **SQL-Based Querying:** Uses standard SQL syntax for querying and analyzing data.\\n* **Machine Learning Integration:** Provides built-in machine learning capabilities for predictive analytics and data exploration.\\n* **Real-Time Analysis:** Supports streaming data analysis and interactive dashboards.\\n* **Collaboration and Sharing:** Allows multiple users to access and analyze data in a collaborative environment.\\n* **Cost-Effective:** Pay-as-you-go pricing based on data scanned and compute resources used.\\n\\n**Applications:**\\n\\n* Data warehousing and analytics\\n* Business intelligence and reporting\\n* Data science and machine learning\\n* Data exploration and visualization\\n* Marketing analytics\\n* Fraud detection and risk management\\n\\n**Benefits:**\\n\\n* Rapid data analysis on large datasets\\n* Reduced infrastructure management overhead\\n* Increased agility and flexibility\\n* Enhanced collaboration and data sharing\\n* Cost-effective data storage and analytics'" ] }, "execution_count": 6, diff --git a/noxfile.py b/noxfile.py index e7f238c01f..1d8ab6c1fd 100644 --- a/noxfile.py +++ b/noxfile.py @@ -716,6 +716,7 @@ def notebook(session: nox.Session): # TODO(swast): investigate why we get 404 
errors, even though # bq_dataframes_llm_code_generation creates a bucket in the sample. "notebooks/generative_ai/bq_dataframes_llm_code_generation.ipynb", # Needs BUCKET_URI. + "notebooks/generative_ai/sentiment_analysis.ipynb", # Too slow "notebooks/vertex_sdk/sdk2_bigframes_pytorch.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_sklearn.ipynb", # Needs BUCKET_URI. "notebooks/vertex_sdk/sdk2_bigframes_tensorflow.ipynb", # Needs BUCKET_URI. From cba21ba8533835cd2286c8b89ac4b7bf144bfce4 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 17:55:09 -0700 Subject: [PATCH 21/21] chore(main): release 0.24.0 (#411) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 30 ++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35eaa3688d..565fe43241 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,36 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.24.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.23.0...v0.24.0) (2024-03-12) + + +### ⚠ BREAKING CHANGES + +* `read_parquet` uses a "pandas" engine to parse files by default. 
Use `engine="bigquery"` for the previous behavior + +### Features + +* (Series|Dataframe).plot.hist() ([#420](https://github.com/googleapis/python-bigquery-dataframes/issues/420)) ([4aadff4](https://github.com/googleapis/python-bigquery-dataframes/commit/4aadff4db59243b4510a874fef2bdb17402d1674)) +* Add detect_anomalies to ml ARIMAPlus and KMeans models ([#426](https://github.com/googleapis/python-bigquery-dataframes/issues/426)) ([6df28ed](https://github.com/googleapis/python-bigquery-dataframes/commit/6df28ed704552ebec7869e1f2034614cb6407098)) +* Add engine parameter to `read_parquet` ([#413](https://github.com/googleapis/python-bigquery-dataframes/issues/413)) ([31325a1](https://github.com/googleapis/python-bigquery-dataframes/commit/31325a190320bf01ced53d9f4cdb94462daaa06b)) +* Add ml PCA.detect_anomalies method ([#422](https://github.com/googleapis/python-bigquery-dataframes/issues/422)) ([8d82945](https://github.com/googleapis/python-bigquery-dataframes/commit/8d8294544ac7fedaca753c5473e3ca2a27868420)) +* Support BYOSA in `remote_function` ([#407](https://github.com/googleapis/python-bigquery-dataframes/issues/407)) ([d92ced2](https://github.com/googleapis/python-bigquery-dataframes/commit/d92ced2adaa30a0405ace9ca6cd70a8e217f13d0)) +* Support CMEK for BQ tables ([#403](https://github.com/googleapis/python-bigquery-dataframes/issues/403)) ([9a678e3](https://github.com/googleapis/python-bigquery-dataframes/commit/9a678e35201d935e1d93875429005033cfe7cff6)) + + +### Bug Fixes + +* Move `third_party.bigframes_vendored` to `bigframes_vendored` ([#424](https://github.com/googleapis/python-bigquery-dataframes/issues/424)) ([763edeb](https://github.com/googleapis/python-bigquery-dataframes/commit/763edeb4f4e8bc4b8bb05a992dae80c49c245e25)) +* Only do row identity based joins when joining by index ([#356](https://github.com/googleapis/python-bigquery-dataframes/issues/356)) 
([76b252f](https://github.com/googleapis/python-bigquery-dataframes/commit/76b252f907055d72556e3e95f6cb5ee41de5b1c2)) +* Read_pandas inline respects location ([#412](https://github.com/googleapis/python-bigquery-dataframes/issues/412)) ([ae0e3ea](https://github.com/googleapis/python-bigquery-dataframes/commit/ae0e3eaca49171fd449de4d43ddc3e3ce9fdc2ce)) + + +### Documentation + +* Add predict sample to samples/snippets/bqml_getting_started_test.py ([#388](https://github.com/googleapis/python-bigquery-dataframes/issues/388)) ([6a3b0cc](https://github.com/googleapis/python-bigquery-dataframes/commit/6a3b0cc7f84120fc5978ce11b6b7c55e89654304)) +* Document minimum IAM requirement ([#416](https://github.com/googleapis/python-bigquery-dataframes/issues/416)) ([36173b0](https://github.com/googleapis/python-bigquery-dataframes/commit/36173b0c14747fb52909bbedd93249024bae9ac1)) +* Fix the note rendering for DataFrames methods: nlargest, nsmallest ([#417](https://github.com/googleapis/python-bigquery-dataframes/issues/417)) ([38bd2ba](https://github.com/googleapis/python-bigquery-dataframes/commit/38bd2ba21bc1a3222635de22eecd97930bf5b1de)) + ## [0.23.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.22.0...v0.23.0) (2024-03-05) diff --git a/bigframes/version.py b/bigframes/version.py index a50b0b86fd..ae18e113ef 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.23.0" +__version__ = "0.24.0"