From 8b5ffa8893b51016c51794865c40def74ea6716b Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Tue, 4 Feb 2025 15:19:57 -0600 Subject: [PATCH 01/22] feat: add `bigframes.bigquery.st_area` and suggest it from `GeoSeries.area` (#1318) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: Add GeoSeries.area * fix formatting * fix import error * fix format and return type * update test * add type ignore flag * add and test st_area * update goseries notebook with area and st_area * ignore mypy error * update the notebook * update the notebook * ignore exception in notebook * update test data and add comment * Update tests/system/small/bigquery/test_geo.py * Update tests/system/small/bigquery/test_geo.py --------- Co-authored-by: Tim Sweña (Swast) --- bigframes/bigquery/__init__.py | 3 + bigframes/bigquery/_operations/geo.py | 93 ++++++++ bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/geopandas/geoseries.py | 27 +++ bigframes/operations/__init__.py | 3 +- bigframes/operations/geo_ops.py | 7 + notebooks/geo/geoseries.ipynb | 223 +++++++++++++++--- tests/system/small/bigquery/test_geo.py | 53 +++++ .../system/small/geopandas/test_geoseries.py | 26 ++ .../bigframes_vendored/geopandas/geoseries.py | 11 +- 10 files changed, 410 insertions(+), 41 deletions(-) create mode 100644 bigframes/bigquery/_operations/geo.py create mode 100644 tests/system/small/bigquery/test_geo.py diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 21e61bc4b1..56aee38bfe 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -27,6 +27,7 @@ unix_millis, unix_seconds, ) +from bigframes.bigquery._operations.geo import st_area from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, @@ -45,6 +46,8 @@ "array_length", "array_agg", "array_to_string", + # geo ops + "st_area", # json ops "json_set", "json_extract", diff --git a/bigframes/bigquery/_operations/geo.py b/bigframes/bigquery/_operations/geo.py new file mode 100644 index 0000000000..262ced4fe8 --- /dev/null +++ b/bigframes/bigquery/_operations/geo.py @@ -0,0 +1,93 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from bigframes import operations as ops +import bigframes.geopandas +import bigframes.series + +""" +Search functions defined from +https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions +""" + + +def st_area(self) -> bigframes.series.Series: + """ + Returns the area in square meters covered by the polygons in the input + GEOGRAPHY. + + If geography_expression is a point or a line, returns zero. If + geography_expression is a collection, returns the area of the polygons + in the collection; if the collection doesn't contain polygons, returns zero. + + + ..note:: + BigQuery's Geography functions, like `st_area`, interpet the geomtry + data type as a point set on the Earth's surface. 
A point set is a set + of points, lines, and polygons on the WGS84 reference spheroid, with + geodesic edges. See: https://cloud.google.com/bigquery/docs/geospatial-data + + + **Examples:** + + >>> import bigframes.geopandas + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> from shapely.geometry import Polygon, LineString, Point + >>> bpd.options.display.progress_bar = None + + >>> series = bigframes.geopandas.GeoSeries( + ... [ + ... Polygon([(0.0, 0.0), (0.1, 0.1), (0.0, 0.1)]), + ... Polygon([(0.10, 0.4), (0.9, 0.5), (0.10, 0.5)]), + ... Polygon([(0.1, 0.1), (0.2, 0.1), (0.2, 0.2)]), + ... LineString([(0, 0), (1, 1), (0, 1)]), + ... Point(0, 1), + ... ] + ... ) + >>> series + 0 POLYGON ((0 0, 0.1 0.1, 0 0.1, 0 0)) + 1 POLYGON ((0.1 0.4, 0.9 0.5, 0.1 0.5, 0.1 0.4)) + 2 POLYGON ((0.1 0.1, 0.2 0.1, 0.2 0.2, 0.1 0.1)) + 3 LINESTRING (0 0, 1 1, 0 1) + 4 POINT (0 1) + dtype: geometry + + >>> bbq.st_area(series) + 0 61821689.855985 + 1 494563347.88721 + 2 61821689.855841 + 3 0.0 + 4 0.0 + dtype: Float64 + + Use `round()` to round the outputed areas to the neares ten millions + + >>> bbq.st_area(series).round(-7) + 0 60000000.0 + 1 490000000.0 + 2 60000000.0 + 3 0.0 + 4 0.0 + dtype: Float64 + + Returns: + bigframes.pandas.Series: + Series of float representing the areas. + """ + series = self._apply_unary_op(ops.geo_area_op) + series.name = None + return series diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index b42f983619..78c3c23abd 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -993,6 +993,11 @@ def geo_y_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).y() +@scalar_op_compiler.register_unary_op(ops.geo_area_op) +def geo_area_op_impl(x: ibis_types.Value): + return typing.cast(ibis_types.GeoSpatialValue, x).area() + + # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index 7a5b24f413..bc0482f60d 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import annotations +import bigframes_vendored.constants as constants import bigframes_vendored.geopandas.geoseries as vendored_geoseries import geopandas.array # type: ignore @@ -39,3 +40,29 @@ def y(self) -> bigframes.series.Series: series = self._apply_unary_op(ops.geo_y_op) series.name = None return series + + # GeoSeries.area overrides Series.area with something totally different. + # Ignore this type error, as we are trying to be as close to geopandas as + # we can. + @property + def area(self, crs=None) -> bigframes.series.Series: # type: ignore + """Returns a Series containing the area of each geometry in the GeoSeries + expressed in the units of the CRS. + + Args: + crs (optional): + Coordinate Reference System of the geometry objects. Can be + anything accepted by pyproj.CRS.from_user_input(), such as an + authority string (eg “EPSG:4326”) or a WKT string. + + Returns: + bigframes.pandas.Series: + Series of float representing the areas. + + Raises: + NotImplementedError: + GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead. + """ + raise NotImplementedError( + f"GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" + ) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index d8b0447686..da1fc36cae 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -84,7 +84,7 @@ SqlScalarOp, where_op, ) -from bigframes.operations.geo_ops import geo_x_op, geo_y_op +from bigframes.operations.geo_ops import geo_area_op, geo_x_op, geo_y_op from bigframes.operations.json_ops import ( JSONExtract, JSONExtractArray, @@ -332,6 +332,7 @@ # Geo ops "geo_x_op", "geo_y_op", + "geo_area_op", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index 73e7e89197..bc14fa611b 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -29,3 +29,10 @@ dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) + +geo_area_op = base_ops.create_unary_op( + name="geo_area", + type_signature=op_typing.FixedOutputType( + dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" + ), +) diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 160d19ce91..7dc4c596ca 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -37,7 +37,6 @@ "import bigframes\n", "import bigframes.geopandas\n", "import bigframes.pandas as bpd\n", - "import shapely\n", "bpd.options.display.progress_bar = None" ] }, @@ -57,7 +56,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:274: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -98,21 +97,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "54 POINT (-93.47523 45.00612)\n", - "256 POINT (-89.60507 42.67552)\n", - "266 POINT (-104.11408 39.31516)\n", - "485 POINT (-91.23193 32.34688)\n", - "765 POINT (-83.42808 38.20427)\n", + "37 POINT (-91.19496 39.98605)\n", + "406 POINT (-84.86717 33.92103)\n", + "926 POINT (-82.47974 35.33641)\n", + "940 POINT (-75.50298 39.09709)\n", + "996 POINT (-92.56434 39.8298)\n", "Name: int_point_geom, dtype: geometry" ] }, - "execution_count": 12, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -129,13 +128,6 @@ "### Convert the five geo points to `bigframes.gopandas.GeoSeries`" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Note: TypeError is raised if the GEOGRAPHY column contains geometry type other than `Point`." 
- ] - }, { "cell_type": "code", "execution_count": 6, @@ -144,11 +136,11 @@ { "data": { "text/plain": [ - "0 POINT (-86.87338 38.37334)\n", - "1 POINT (-118.48037 46.25461)\n", - "2 POINT (-92.5617 32.30429)\n", - "3 POINT (-83.46189 39.55525)\n", - "4 POINT (-119.46779 47.21363)\n", + "0 POINT (-91.19496 39.98605)\n", + "1 POINT (-84.86717 33.92103)\n", + "2 POINT (-82.47974 35.33641)\n", + "3 POINT (-75.50298 39.09709)\n", + "4 POINT (-92.56434 39.8298)\n", "dtype: geometry" ] }, @@ -171,6 +163,13 @@ "### Retrieve the x (longitude) and y (latitude) from the GeoSeries with `.x` and `.y`." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Note: TypeError is raised if `.x` and `.y` are used with a geometry type other than `Point`." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -186,11 +185,11 @@ { "data": { "text/plain": [ - "0 -86.873385\n", - "1 -118.48037\n", - "2 -92.5617\n", - "3 -83.461893\n", - "4 -119.467788\n", + "0 -91.194961\n", + "1 -84.867169\n", + "2 -82.479741\n", + "3 -75.502982\n", + "4 -92.56434\n", "dtype: Float64" ] }, @@ -218,11 +217,11 @@ { "data": { "text/plain": [ - "0 38.373344\n", - "1 46.254606\n", - "2 32.30429\n", - "3 39.555246\n", - "4 47.213633\n", + "0 39.986053\n", + "1 33.92103\n", + "2 35.336415\n", + "3 39.097088\n", + "4 39.829795\n", "dtype: Float64" ] }, @@ -251,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -285,7 +284,7 @@ "dtype: Float64" ] }, - "execution_count": 13, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -303,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -337,7 +336,7 @@ "dtype: Float64" ] }, - "execution_count": 14, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -345,6 +344,160 @@ "source": [ "point_geom_series.geo.y" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Retrive the `area` of different geometry shapes. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Create a geometry collection from local data with `Peek`" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10 POLYGON ((-101.7778 40.34969, -101.77812 40.34...\n", + "127 POLYGON ((-89.22333 44.50398, -89.22334 44.499...\n", + "253 POLYGON ((-76.69446 37.07288, -76.69515 37.072...\n", + "261 POLYGON ((-98.70136 44.45055, -98.70136 44.450...\n", + "303 POLYGON ((-85.99565 30.28131, -85.99566 30.280...\n", + "Name: county_geom, dtype: geometry" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_series = df[\"county_geom\"].peek(n = 5)\n", + "geom_series" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Convert the geometry collection to `bigframes.gopandas.GeoSeries`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POLYGON ((-101.7778 40.34969, -101.77812 40.34...\n", + "1 POLYGON ((-89.22333 44.50398, -89.22334 44.499...\n", + "2 POLYGON ((-76.69446 37.07288, -76.69515 37.072...\n", + "3 POLYGON ((-98.70136 44.45055, -98.70136 44.450...\n", + "4 POLYGON ((-85.99565 30.28131, -85.99566 30.280...\n", + "dtype: geometry" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "five_geom = bigframes.geopandas.GeoSeries(\n", + " [point for point in geom_series]\n", + ")\n", + "five_geom" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "source": [ + "## Note: `bigframes.geopandas.GeoSeries.area` raises NotImplementedError. " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "tags": [ + "raises-exception" + ] + }, + "outputs": [ + { + "ename": "NotImplementedError", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.34.0", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", + "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:66\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. 
Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 68\u001b[0m )\n", + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.34.0" + ] + } + ], + "source": [ + "five_geom.area" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import bigframes.bigquery as bbq" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2382382043.48891\n", + "1 1977633097.26862\n", + "2 939388839.499466\n", + "3 3269015229.381782\n", + "4 2678752241.321673\n", + "dtype: Float64" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "geom_area = bbq.st_area(five_geom)\n", + "geom_area" + ] } ], "metadata": { diff --git a/tests/system/small/bigquery/test_geo.py b/tests/system/small/bigquery/test_geo.py new file mode 100644 index 0000000000..7d38cd7d91 --- /dev/null +++ b/tests/system/small/bigquery/test_geo.py @@ -0,0 +1,53 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import geopandas # type: ignore +import pandas as pd +from shapely.geometry import LineString, Point, Polygon # type: ignore + +import bigframes.bigquery as bbq +import bigframes.geopandas +import bigframes.series + + +def test_geo_st_area(): + data = [ + Polygon([(0.000, 0.0), (0.001, 0.001), (0.000, 0.001)]), + Polygon([(0.0010, 0.004), (0.009, 0.005), (0.0010, 0.005)]), + Polygon([(0.001, 0.001), (0.002, 0.001), (0.002, 0.002)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ] + + geopd_s = geopandas.GeoSeries(data=data, crs="EPSG:4326") + geobf_s = bigframes.geopandas.GeoSeries(data=data) + + # For `geopd_s`, the data was further projected with `geopandas.GeoSeries.to_crs` + # to `to_crs(26393)` to get the area in square meter. See: https://geopandas.org/en/stable/docs/user_guide/projections.html + # and https://spatialreference.org/ref/epsg/26393/. 
We then rounded both results + # to get them as close to each other as possible. Initially, the area results + # were +ten-millions. We added more zeros after the decimal point to round the + # area results to the nearest thousands. + geopd_s_result = geopd_s.to_crs(26393).area.round(-3) + geobf_s_result = bbq.st_area(geobf_s).to_pandas().round(-3) + assert geobf_s_result.iloc[0] >= 1000 + + pd.testing.assert_series_equal( + geobf_s_result, + geopd_s_result, + check_dtype=False, + check_index_type=False, + check_exact=False, + rtol=1, + ) diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index a30460d461..2967e4d247 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -12,10 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import re + +import bigframes_vendored.constants as constants import geopandas # type: ignore +from geopandas.array import GeometryDtype # type:ignore import google.api_core.exceptions import pandas as pd import pytest +from shapely.geometry import LineString, Point, Polygon # type: ignore import bigframes.geopandas import bigframes.series @@ -61,3 +66,24 @@ def test_geo_y(urban_areas_dfs): pd_result.astype(pd.Float64Dtype()), bf_result, ) + + +def test_geo_area_not_supported(): + s = bigframes.pandas.Series( + [ + Polygon([(0, 0), (1, 1), (0, 1)]), + Polygon([(10, 0), (10, 5), (0, 0)]), + Polygon([(0, 0), (2, 2), (2, 0)]), + LineString([(0, 0), (1, 1), (0, 1)]), + Point(0, 1), + ], + dtype=GeometryDtype(), + ) + bf_series: bigframes.geopandas.GeoSeries = s.geo + with pytest.raises( + NotImplementedError, + match=re.escape( + f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" + ), + ): + bf_series.area diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index 2ad35ed852..d84dec94a8 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -17,8 +17,9 @@ class GeoSeries: >>> import bigframes.geopandas >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> from shapely.geometry import Point + >>> bpd.options.display.progress_bar = None + >>> s = bigframes.geopandas.GeoSeries([Point(1, 1), Point(2, 2), Point(3, 3)]) >>> s 0 POINT (1 1) @@ -43,9 +44,9 @@ def x(self) -> bigframes.series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import geopandas.array >>> import shapely + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... [shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], @@ -58,7 +59,7 @@ def x(self) -> bigframes.series.Series: dtype: Float64 Returns: - bigframes.series.Series: + bigframes.pandas.Series: Return the x location (longitude) of point geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @@ -70,9 +71,9 @@ def y(self) -> bigframes.series.Series: **Examples:** >>> import bigframes.pandas as bpd - >>> bpd.options.display.progress_bar = None >>> import geopandas.array >>> import shapely + >>> bpd.options.display.progress_bar = None >>> series = bpd.Series( ... 
[shapely.Point(1, 2), shapely.Point(2, 3), shapely.Point(3, 4)], @@ -85,7 +86,7 @@ def y(self) -> bigframes.series.Series: dtype: Float64 Returns: - bigframes.series.Series: + bigframes.pandas.Series: Return the y location (latitude) of point geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From e7493c826794d578f6317bd37d5d1b047803b38c Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Tue, 4 Feb 2025 18:35:29 -0800 Subject: [PATCH 02/22] Revert "chore: remove 10t benchmark temporarily (#1341)" (#1363) This reverts commit ce7d92f9decd11de2235348d9fa478ef8a049d84. --- tests/benchmark/tpch/config.jsonl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/benchmark/tpch/config.jsonl b/tests/benchmark/tpch/config.jsonl index 779b0fe2d7..e6f7a444f6 100644 --- a/tests/benchmark/tpch/config.jsonl +++ b/tests/benchmark/tpch/config.jsonl @@ -6,3 +6,5 @@ {"benchmark_suffix": "100g_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0100g", "ordered": false} {"benchmark_suffix": "1t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": true} {"benchmark_suffix": "1t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0001t", "ordered": false} +{"benchmark_suffix": "10t_ordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": true} +{"benchmark_suffix": "10t_unordered", "project_id": "bigframes-dev-perf", "dataset_id": "tpch_0010t", "ordered": false} From 7ae565d9e0e59fdf75c7659c0263562688ccc1e8 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Wed, 5 Feb 2025 09:32:36 -0800 Subject: [PATCH 03/22] perf: Simplify merge join key coalescing (#1361) --- bigframes/core/blocks.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 43f605dc03..b1f4ed35cc 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -2077,14 +2077,12 @@ def merge( result_columns = [] matching_join_labels = [] - coalesced_ids = [] - for left_id, right_id in zip(left_join_ids, right_join_ids): - joined_expr, coalesced_id = joined_expr.project_to_id( - ops.coalesce_op.as_expr( - get_column_left[left_id], get_column_right[right_id] - ), - ) - coalesced_ids.append(coalesced_id) + left_post_join_ids = tuple(get_column_left[id] for id in left_join_ids) + right_post_join_ids = tuple(get_column_right[id] for id in right_join_ids) + + joined_expr, coalesced_ids = coalesce_columns( + joined_expr, left_post_join_ids, right_post_join_ids, how=how, drop=False + ) for col_id in self.value_columns: if col_id in left_join_ids: @@ -2102,7 +2100,6 @@ def merge( result_columns.append(get_column_left[col_id]) for col_id in other.value_columns: if col_id in right_join_ids: - key_part = right_join_ids.index(col_id) if other.col_id_to_label[matching_right_id] in matching_join_labels: pass else: @@ -2928,26 +2925,31 @@ def resolve_label_id(label: Label) -> str: ) +# TODO: Rewrite just to return expressions def coalesce_columns( expr: core.ArrayValue, left_ids: typing.Sequence[str], right_ids: typing.Sequence[str], how: str, + drop: bool = True, ) -> Tuple[core.ArrayValue, Sequence[str]]: result_ids = [] for left_id, right_id in zip(left_ids, right_ids): if how == "left" or how == "inner" or how == "cross": result_ids.append(left_id) - expr = expr.drop_columns([right_id]) + if drop: + expr = expr.drop_columns([right_id]) elif how == "right": result_ids.append(right_id) 
- expr = expr.drop_columns([left_id]) + if drop: + expr = expr.drop_columns([left_id]) elif how == "outer": coalesced_id = guid.generate_guid() expr, coalesced_id = expr.project_to_id( ops.coalesce_op.as_expr(left_id, right_id) ) - expr = expr.drop_columns([left_id, right_id]) + if drop: + expr = expr.drop_columns([left_id, right_id]) result_ids.append(coalesced_id) else: raise ValueError(f"Unexpected join type: {how}. {constants.FEEDBACK_LINK}") From b9bdca8285ee54fecf3795fbf3cbea6f878ee8ca Mon Sep 17 00:00:00 2001 From: Chelsea Lin Date: Wed, 5 Feb 2025 11:14:18 -0800 Subject: [PATCH 04/22] fix: dtype parameter ineffective in Series/DataFrame construction (#1354) * fix: dtype parameter ineffective in Series IO * Revert "docs: update struct examples. (#953)" This reverts commit d632cd03e3e3ea6dfa7c56dd459c422e95be906e. * skip array tests because of dtype mismatches --- bigframes/dtypes.py | 5 +- tests/system/small/bigquery/test_json.py | 5 +- tests/system/small/test_dataframe.py | 13 ++++ tests/system/small/test_series.py | 74 +++++++++++++++++++ .../pandas/core/arrays/arrow/accessors.py | 44 +++++------ 5 files changed, 116 insertions(+), 25 deletions(-) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index 8b1ca3b0c8..b06046a027 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -295,7 +295,10 @@ def is_object_like(type_: Union[ExpressionType, str]) -> bool: # See: https://stackoverflow.com/a/40312924/101923 and # https://numpy.org/doc/stable/reference/generated/numpy.dtype.kind.html # for the way to identify object type. - return type_ in ("object", "O") or getattr(type_, "kind", None) == "O" + return type_ in ("object", "O") or ( + getattr(type_, "kind", None) == "O" + and getattr(type_, "storage", None) != "pyarrow" + ) def is_string_like(type_: ExpressionType) -> bool: diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index aa490749ae..8f97856eea 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -16,6 +16,7 @@ import geopandas as gpd # type: ignore import pandas as pd +import pyarrow as pa import pytest import bigframes.bigquery as bbq @@ -174,7 +175,7 @@ def test_json_extract_array_from_json_strings(): actual = bbq.json_extract_array(s, "$.a") expected = bpd.Series( [['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"'], None], - dtype=pd.StringDtype(storage="pyarrow"), + dtype=pd.ArrowDtype(pa.list_(pa.string())), ) pd.testing.assert_series_equal( actual.to_pandas(), @@ -190,7 +191,7 @@ def test_json_extract_array_from_json_array_strings(): actual = bbq.json_extract_array(s) expected = bpd.Series( [["1", "2", "3"], [], ["4", "5"]], - dtype=pd.StringDtype(storage="pyarrow"), + dtype=pd.ArrowDtype(pa.list_(pa.string())), ) pd.testing.assert_series_equal( actual.to_pandas(), diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index aa038c62d8..e7556043af 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -166,6 +166,19 @@ def test_df_construct_inline_respects_location(): assert table.location == "europe-west1" +def test_df_construct_dtype(): + data = { + "int_col": [1, 2, 3], + "string_col": ["1.1", "2.0", "3.5"], + "float_col": [1.0, 2.0, 3.0], + } + dtype = pd.StringDtype(storage="pyarrow") + bf_result = dataframe.DataFrame(data, dtype=dtype) + pd_result = pd.DataFrame(data, dtype=dtype) + pd_result.index = pd_result.index.astype("Int64") + 
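+    # BigQuery DataFrames builds an Int64 default index, so the expected pandas
+    # frame's index is cast to Int64 above before the two results are compared.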
pandas.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + def test_get_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index fb48bf58b4..cdda7c753d 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -26,6 +26,7 @@ import pytest import shapely # type: ignore +import bigframes.features import bigframes.pandas import bigframes.series as series from tests.system.utils import ( @@ -228,6 +229,79 @@ def test_series_construct_geodata(): ) +@pytest.mark.parametrize( + ("dtype"), + [ + pytest.param(pd.Int64Dtype(), id="int"), + pytest.param(pd.Float64Dtype(), id="float"), + pytest.param(pd.StringDtype(storage="pyarrow"), id="string"), + ], +) +def test_series_construct_w_dtype_for_int(dtype): + data = [1, 2, 3] + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + series = bigframes.pandas.Series(data, dtype=dtype) + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_struct(): + # The data shows the struct fields are disordered and correctly handled during + # construction. + data = [ + {"a": 1, "c": "pandas", "b": dt.datetime(2020, 1, 20, 20, 20, 20, 20)}, + {"a": 2, "c": "pandas", "b": dt.datetime(2019, 1, 20, 20, 20, 20, 20)}, + {"a": 1, "c": "numpy", "b": None}, + ] + dtype = pd.ArrowDtype( + pa.struct([("a", pa.int64()), ("c", pa.string()), ("b", pa.timestamp("us"))]) + ) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + pd.testing.assert_series_equal(series.to_pandas(), expected) + + +def test_series_construct_w_dtype_for_array_string(): + data = [["1", "2", "3"], [], ["4", "5"]] + dtype = pd.ArrowDtype(pa.list_(pa.string())) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. + if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + +def test_series_construct_w_dtype_for_array_struct(): + data = [[{"a": 1, "c": "aa"}, {"a": 2, "c": "bb"}], [], [{"a": 3, "c": "cc"}]] + dtype = pd.ArrowDtype(pa.list_(pa.struct([("a", pa.int64()), ("c", pa.string())]))) + series = bigframes.pandas.Series(data, dtype=dtype) + expected = pd.Series(data, dtype=dtype) + expected.index = expected.index.astype("Int64") + + # Skip dtype check due to internal issue b/321013333. This issue causes array types + # to be converted to the `object` dtype when calling `to_pandas()`, resulting in + # a mismatch with the expected Pandas type. 
+ if bigframes.features.PANDAS_VERSIONS.is_arrow_list_dtype_usable: + check_dtype = True + else: + check_dtype = False + + pd.testing.assert_series_equal( + series.to_pandas(), expected, check_dtype=check_dtype + ) + + def test_series_keys(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_col"].keys().to_pandas() diff --git a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py index 771146250a..fe15e7b40d 100644 --- a/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py +++ b/third_party/bigframes_vendored/pandas/core/arrays/arrow/accessors.py @@ -87,12 +87,12 @@ def field(self, name_or_index: str | int): >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) @@ -106,7 +106,7 @@ def field(self, name_or_index: str | int): Extract by field index. - >>> s.struct.field(1) + >>> s.struct.field(0) 0 1 1 2 2 1 @@ -133,22 +133,22 @@ def explode(self): >>> bpd.options.display.progress_bar = None >>> s = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... ) Extract all child fields. >>> s.struct.explode() - project version - 0 pandas 1 - 1 pandas 2 - 2 numpy 1 + version project + 0 1 pandas + 1 2 pandas + 2 1 numpy [3 rows x 2 columns] @@ -178,8 +178,8 @@ def dtypes(self): ... )) ... ) >>> s.struct.dtypes() - project string[pyarrow] version Int64 + project string[pyarrow] dtype: object Returns: @@ -205,21 +205,21 @@ def explode(self, column, *, separator: str = "."): >>> countries = bpd.Series(["cn", "es", "us"]) >>> files = bpd.Series( ... [ - ... {"project": "pandas", "version": 1}, - ... {"project": "pandas", "version": 2}, - ... {"project": "numpy", "version": 1}, + ... {"version": 1, "project": "pandas"}, + ... {"version": 2, "project": "pandas"}, + ... {"version": 1, "project": "numpy"}, ... ], ... dtype=bpd.ArrowDtype(pa.struct( - ... [("project", pa.string()), ("version", pa.int64())] + ... [("version", pa.int64()), ("project", pa.string())] ... )) ... 
) >>> downloads = bpd.Series([100, 200, 300]) >>> df = bpd.DataFrame({"country": countries, "file": files, "download_count": downloads}) >>> df.struct.explode("file") - country file.project file.version download_count - 0 cn pandas 1 100 - 1 es pandas 2 200 - 2 us numpy 1 300 + country file.version file.project download_count + 0 cn 1 pandas 100 + 1 es 2 pandas 200 + 2 us 1 numpy 300 [3 rows x 4 columns] From 86b7e72097ce67d88b72cfe031080d5af22f65cd Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Wed, 5 Feb 2025 12:34:31 -0800 Subject: [PATCH 05/22] chore: support timestamp subtractions (#1346) * chore: support timestamp subtractions * Fix format * use tree rewrites to dispatch timestamp_diff operator * add TODO for more node updates * polish the code and fix typos * fix comment * add rewrites to compile_raw and compile_peek_sql --- bigframes/core/compile/compiler.py | 4 + bigframes/core/compile/ibis_types.py | 2 +- bigframes/core/compile/scalar_op_compiler.py | 5 ++ bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/operators.py | 82 +++++++++++++++++++ bigframes/dtypes.py | 2 +- bigframes/operations/__init__.py | 4 + bigframes/operations/datetime_ops.py | 19 +++++ bigframes/operations/numeric_ops.py | 5 +- bigframes/operations/timedelta_ops.py | 2 +- bigframes/series.py | 4 +- bigframes/session/loader.py | 4 +- .../system/small/operations/test_datetimes.py | 81 ++++++++++++++++++ 13 files changed, 208 insertions(+), 8 deletions(-) create mode 100644 bigframes/core/rewrite/operators.py diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index a72ca47190..dca204401e 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -58,6 +58,7 @@ def compile_sql( # TODO: get rid of output_ids arg assert len(output_ids) == len(list(node.fields)) node = set_output_names(node, output_ids) + node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) @@ -81,6 +82,7 @@ def compile_sql( def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: ids = [id.sql for id in node.ids] node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) @@ -93,6 +95,7 @@ def compile_raw( str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering ]: node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) ir = self.compile_node(node) sql = ir.to_sql() @@ -100,6 +103,7 @@ def compile_raw( def _preprocess(self, node: nodes.BigFrameNode): node = nodes.bottom_up(node, rewrites.rewrite_slice) + node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 8a55f6775d..78c2259cf0 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -79,7 +79,7 @@ BIGFRAMES_TO_IBIS: Dict[bigframes.dtypes.Dtype, ibis_dtypes.DataType] = { pandas: ibis for ibis, pandas in BIDIRECTIONAL_MAPPINGS } -BIGFRAMES_TO_IBIS.update({bigframes.dtypes.TIMEDETLA_DTYPE: ibis_dtypes.int64}) 
+BIGFRAMES_TO_IBIS.update({bigframes.dtypes.TIMEDELTA_DTYPE: ibis_dtypes.int64}) IBIS_TO_BIGFRAMES: Dict[ibis_dtypes.DataType, bigframes.dtypes.Dtype] = { ibis: pandas for ibis, pandas in BIDIRECTIONAL_MAPPINGS } diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 78c3c23abd..4739cc9a99 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -737,6 +737,11 @@ def unix_millis_op_impl(x: ibis_types.TimestampValue): return unix_millis(x) +@scalar_op_compiler.register_binary_op(ops.timestamp_diff_op) +def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.TimestampValue): + return x.delta(y, "microsecond") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index 9044cb25f9..f93186bf36 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -15,6 +15,7 @@ from bigframes.core.rewrite.identifiers import remap_variables from bigframes.core.rewrite.implicit_align import try_row_join from bigframes.core.rewrite.legacy_align import legacy_join_as_projection +from bigframes.core.rewrite.operators import rewrite_timedelta_ops from bigframes.core.rewrite.order import pull_up_order from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice @@ -22,6 +23,7 @@ "legacy_join_as_projection", "try_row_join", "rewrite_slice", + "rewrite_timedelta_ops", "pullup_limit_from_slice", "remap_variables", "pull_up_order", diff --git a/bigframes/core/rewrite/operators.py b/bigframes/core/rewrite/operators.py new file mode 100644 index 0000000000..3145a9e9ae --- /dev/null +++ b/bigframes/core/rewrite/operators.py @@ -0,0 +1,82 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import dataclasses +import functools +import typing + +from bigframes import dtypes +from bigframes import operations as ops +from bigframes.core import expression as ex +from bigframes.core import nodes, schema + + +@dataclasses.dataclass +class _TypedExpr: + expr: ex.Expression + dtype: dtypes.Dtype + + +def rewrite_timedelta_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode: + """ + Rewrites expressions to properly handle timedelta values, because this type does not exist + in the SQL world. + """ + if isinstance(root, nodes.ProjectionNode): + updated_assignments = tuple( + (_rewrite_expressions(expr, root.schema).expr, column_id) + for expr, column_id in root.assignments + ) + root = nodes.ProjectionNode(root.child, updated_assignments) + + # TODO(b/394354614): FilterByNode and OrderNode also contain expressions. Need to update them too. 
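+    # For now only ProjectionNode assignments are rewritten; all other node
+    # types pass through unchanged (see TODO above).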
+ return root + + +@functools.cache +def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _TypedExpr: + if isinstance(expr, ex.DerefOp): + return _TypedExpr(expr, schema.get_type(expr.id.sql)) + + if isinstance(expr, ex.ScalarConstantExpression): + return _TypedExpr(expr, expr.dtype) + + if isinstance(expr, ex.OpExpression): + updated_inputs = tuple( + map(lambda x: _rewrite_expressions(x, schema), expr.inputs) + ) + return _rewrite_op_expr(expr, updated_inputs) + + raise AssertionError(f"Unexpected expression type: {type(expr)}") + + +def _rewrite_op_expr( + expr: ex.OpExpression, inputs: typing.Tuple[_TypedExpr, ...] +) -> _TypedExpr: + if isinstance(expr.op, ops.SubOp): + return _rewrite_sub_op(inputs[0], inputs[1]) + + input_types = tuple(map(lambda x: x.dtype, inputs)) + return _TypedExpr(expr, expr.op.output_type(*input_types)) + + +def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + result_op: ops.BinaryOp = ops.sub_op + if dtypes.is_datetime_like(left.dtype) and dtypes.is_datetime_like(right.dtype): + result_op = ops.timestamp_diff_op + + return _TypedExpr( + result_op.as_expr(left.expr, right.expr), + result_op.output_type(left.dtype, right.dtype), + ) diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index b06046a027..d5be2ca584 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -56,7 +56,7 @@ TIME_DTYPE = pd.ArrowDtype(pa.time64("us")) DATETIME_DTYPE = pd.ArrowDtype(pa.timestamp("us")) TIMESTAMP_DTYPE = pd.ArrowDtype(pa.timestamp("us", tz="UTC")) -TIMEDETLA_DTYPE = pd.ArrowDtype(pa.duration("us")) +TIMEDELTA_DTYPE = pd.ArrowDtype(pa.duration("us")) NUMERIC_DTYPE = pd.ArrowDtype(pa.decimal128(38, 9)) BIGNUMERIC_DTYPE = pd.ArrowDtype(pa.decimal256(76, 38)) # No arrow equivalent diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index da1fc36cae..ba8f3f64d7 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -49,6 +49,7 @@ date_op, StrftimeOp, time_op, + timestamp_diff_op, ToDatetimeOp, ToTimestampOp, UnixMicros, @@ -125,6 +126,7 @@ sinh_op, sqrt_op, sub_op, + SubOp, tan_op, tanh_op, unsafe_pow_op, @@ -246,6 +248,7 @@ # Datetime ops "date_op", "time_op", + "timestamp_diff_op", "ToDatetimeOp", "ToTimestampOp", "StrftimeOp", @@ -283,6 +286,7 @@ "sinh_op", "sqrt_op", "sub_op", + "SubOp", "tan_op", "tanh_op", "unsafe_pow_op", diff --git a/bigframes/operations/datetime_ops.py b/bigframes/operations/datetime_ops.py index 5086de27d3..3ea4c652f1 100644 --- a/bigframes/operations/datetime_ops.py +++ b/bigframes/operations/datetime_ops.py @@ -107,3 +107,22 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT if input_types[0] is not dtypes.TIMESTAMP_DTYPE: raise TypeError("expected timestamp input") return dtypes.INT_DTYPE + + +@dataclasses.dataclass(frozen=True) +class TimestampDiff(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_diff" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + if input_types[0] is not input_types[1]: + raise TypeError( + f"two inputs have different types. 
left: {input_types[0]}, right: {input_types[1]}" + ) + + if not dtypes.is_datetime_like(input_types[0]): + raise TypeError("expected timestamp input") + + return dtypes.TIMEDELTA_DTYPE + + +timestamp_diff_op = TimestampDiff() diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 939330954d..413d8d66e1 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -141,7 +141,10 @@ def output_type(self, *input_types): ): # Numeric subtraction return dtypes.coerce_to_common(left_type, right_type) - # TODO: Add temporal addition once delta types supported + + if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type): + return dtypes.TIMEDELTA_DTYPE + raise TypeError(f"Cannot subtract dtypes {left_type} and {right_type}") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index 0bcd6eb08f..e212381557 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -28,4 +28,4 @@ class ToTimedeltaOp(base_ops.UnaryOp): def output_type(self, *input_types): if input_types[0] is not dtypes.INT_DTYPE: raise TypeError("expected integer input") - return dtypes.TIMEDETLA_DTYPE + return dtypes.TIMEDELTA_DTYPE diff --git a/bigframes/series.py b/bigframes/series.py index 706c0f4f09..af9fce6e20 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -805,10 +805,10 @@ def __rsub__(self, other: float | int | Series) -> Series: __rsub__.__doc__ = inspect.getdoc(vendored_pandas_series.Series.__rsub__) - def sub(self, other: float | int | Series) -> Series: + def sub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op) - def rsub(self, other: float | int | Series) -> Series: + def rsub(self, other) -> Series: return self._apply_binary_op(other, ops.sub_op, reverse=True) subtract = sub diff --git a/bigframes/session/loader.py b/bigframes/session/loader.py index ba693696c3..b7550583e5 100644 --- a/bigframes/session/loader.py +++ b/bigframes/session/loader.py @@ -177,7 +177,7 @@ def read_pandas_load_job( destination_table = self._bqclient.get_table(load_table_destination) col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDETLA_DTYPE + col: bigframes.dtypes.TIMEDELTA_DTYPE for col in df_and_labels.timedelta_cols } array_value = core.ArrayValue.from_table( @@ -236,7 +236,7 @@ def read_pandas_streaming( ) col_type_overrides: typing.Dict[str, bigframes.dtypes.Dtype] = { - col: bigframes.dtypes.TIMEDETLA_DTYPE + col: bigframes.dtypes.TIMEDELTA_DTYPE for col in df_and_labels.timedelta_cols } array_value = ( diff --git a/tests/system/small/operations/test_datetimes.py b/tests/system/small/operations/test_datetimes.py index c5c649c638..936becff76 100644 --- a/tests/system/small/operations/test_datetimes.py +++ b/tests/system/small/operations/test_datetimes.py @@ -14,6 +14,8 @@ import datetime +import numpy +from pandas import testing import pandas as pd import pytest @@ -367,3 +369,82 @@ def test_dt_clip_coerce_str_timestamp(scalars_dfs): pd_result, bf_result, ) + + +@pytest.mark.parametrize("column", ["timestamp_col", "datetime_col"]) +def test_timestamp_diff_two_series(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (bf_series - bf_series).to_pandas() + + expected_result = pd_series - pd_series + assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize("column", ["timestamp_col", 
"datetime_col"]) +def test_timestamp_diff_two_series_with_numpy_ops(scalars_dfs, column): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = numpy.subtract(bf_series, bf_series).to_pandas() + + expected_result = numpy.subtract(pd_series, pd_series) + assert_series_equal(actual_result, expected_result) + + +def test_timestamp_diff_two_dataframes(scalars_dfs): + columns = ["timestamp_col", "datetime_col"] + bf_df, pd_df = scalars_dfs + bf_df = bf_df[columns] + pd_df = pd_df[columns] + + actual_result = (bf_df - bf_df).to_pandas() + + expected_result = pd_df - pd_df + testing.assert_frame_equal(actual_result, expected_result) + + +def test_timestamp_diff_two_series_with_different_types_raise_error(scalars_dfs): + df, _ = scalars_dfs + + with pytest.raises(TypeError): + (df["timestamp_col"] - df["datetime_col"]).to_pandas() + + +@pytest.mark.parametrize( + ("column", "value"), + [ + ("timestamp_col", pd.Timestamp("2025-01-01 00:00:01", tz="America/New_York")), + ("datetime_col", datetime.datetime(2025, 1, 1, 0, 0, 1)), + ], +) +def test_timestamp_diff_series_sub_literal(scalars_dfs, column, value): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (bf_series - value).to_pandas() + + expected_result = pd_series - value + assert_series_equal(actual_result, expected_result) + + +@pytest.mark.parametrize( + ("column", "value"), + [ + ("timestamp_col", pd.Timestamp("2025-01-01 00:00:01", tz="America/New_York")), + ("datetime_col", datetime.datetime(2025, 1, 1, 0, 0, 1)), + ], +) +def test_timestamp_diff_literal_sub_series(scalars_dfs, column, value): + bf_df, pd_df = scalars_dfs + bf_series = bf_df[column] + pd_series = pd_df[column] + + actual_result = (value - bf_series).to_pandas() + + expected_result = value - pd_series + assert_series_equal(actual_result, expected_result) From aec3fe7a986cfcea842383cd71ce1ef7c100f696 Mon Sep 17 00:00:00 2001 From: Huan Chen <142538604+Genesis929@users.noreply.github.com> Date: Wed, 5 Feb 2025 17:45:00 -0800 Subject: [PATCH 06/22] chore: add a tool to upload tpcds data to bigquery. (#1367) * chore: add a tool to upload tpcds data to bigquery. 
* update error type * update docstring --- scripts/dev-utils/tpcds_upload_helper.py | 597 +++++++++++++++++++++++ 1 file changed, 597 insertions(+) create mode 100644 scripts/dev-utils/tpcds_upload_helper.py diff --git a/scripts/dev-utils/tpcds_upload_helper.py b/scripts/dev-utils/tpcds_upload_helper.py new file mode 100644 index 0000000000..52bb553cd8 --- /dev/null +++ b/scripts/dev-utils/tpcds_upload_helper.py @@ -0,0 +1,597 @@ +import argparse +import csv +import os +import sys + +import google.api_core.exceptions +from google.cloud import bigquery + + +def preprocess_csv(input_file_path, output_file_path): + try: + with open( + input_file_path, mode="r", newline="", encoding="utf-8" + ) as infile, open( + output_file_path, mode="w", newline="", encoding="utf-8" + ) as outfile: + reader = csv.reader(infile, delimiter="|") + writer = csv.writer(outfile, delimiter="|") + + for row in reader: + writer.writerow(row[:-1]) + except Exception as e: + print(f"An error occurred: {e}") + + +def get_schema(table_name): + schema = { + "customer_address": [ + bigquery.SchemaField("ca_address_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ca_address_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("ca_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("ca_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ca_location_type", "STRING", mode="NULLABLE"), + ], + "customer_demographics": [ + bigquery.SchemaField("cd_demo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cd_gender", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_marital_status", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_education_status", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_purchase_estimate", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_credit_rating", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_employed_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cd_dep_college_count", "INTEGER", mode="NULLABLE"), + ], + "date_dim": [ + bigquery.SchemaField("d_date_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("d_date_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("d_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("d_month_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_week_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_quarter_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_year", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_dow", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_moy", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_qoy", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_year", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_quarter_seq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_fy_week_seq", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("d_day_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_quarter_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_holiday", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_weekend", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_following_holiday", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_first_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_last_dom", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_same_day_ly", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_same_day_lq", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("d_current_day", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_week", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_month", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_quarter", "STRING", mode="NULLABLE"), + bigquery.SchemaField("d_current_year", "STRING", mode="NULLABLE"), + ], + "warehouse": [ + bigquery.SchemaField("w_warehouse_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("w_warehouse_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("w_warehouse_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_warehouse_sq_ft", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("w_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("w_gmt_offset", "FLOAT", mode="NULLABLE"), + ], + "ship_mode": [ + bigquery.SchemaField("sm_ship_mode_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sm_ship_mode_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("sm_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_code", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_carrier", "STRING", mode="NULLABLE"), + bigquery.SchemaField("sm_contract", "STRING", mode="NULLABLE"), + ], + "time_dim": [ + bigquery.SchemaField("t_time_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("t_time_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("t_time", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_hour", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_minute", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_second", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("t_am_pm", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_shift", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_sub_shift", "STRING", mode="NULLABLE"), + bigquery.SchemaField("t_meal_time", "STRING", mode="NULLABLE"), + ], + "reason": [ + bigquery.SchemaField("r_reason_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("r_reason_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("r_reason_desc", "STRING", mode="NULLABLE"), + ], + "income_band": [ + bigquery.SchemaField("ib_income_band_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ib_lower_bound", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ib_upper_bound", "INTEGER", mode="NULLABLE"), + ], + "item": [ + bigquery.SchemaField("i_item_sk", "INTEGER", mode="REQUIRED"), + 
bigquery.SchemaField("i_item_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("i_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("i_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("i_item_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_current_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("i_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("i_brand_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_brand", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_class_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_category_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_category", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_manufact_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_manufact", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_size", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_formulation", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_color", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_units", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_container", "STRING", mode="NULLABLE"), + bigquery.SchemaField("i_manager_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("i_product_name", "STRING", mode="NULLABLE"), + ], + "store": [ + bigquery.SchemaField("s_store_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("s_store_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("s_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("s_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("s_closed_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_store_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_number_employees", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_floor_space", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_hours", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_geography_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_division_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_division_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_company_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("s_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("s_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("s_tax_precentage", "FLOAT", mode="NULLABLE"), + ], + "call_center": [ + bigquery.SchemaField("cc_call_center_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cc_call_center_id", "STRING", mode="REQUIRED"), + 
bigquery.SchemaField("cc_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("cc_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("cc_closed_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_open_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_employees", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_sq_ft", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_hours", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_mkt_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_division", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_division_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_company", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cc_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cc_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cc_tax_percentage", "FLOAT", mode="NULLABLE"), + ], + "customer": [ + bigquery.SchemaField("c_customer_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("c_customer_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("c_current_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_current_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_current_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_first_shipto_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_first_sales_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_salutation", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_first_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_last_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_preferred_cust_flag", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_birth_day", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_month", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_year", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("c_birth_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_login", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_email_address", "STRING", mode="NULLABLE"), + bigquery.SchemaField("c_last_review_date_sk", "STRING", mode="NULLABLE"), + ], + "web_site": [ + bigquery.SchemaField("web_site_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("web_site_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("web_rec_start_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("web_rec_end_date", "DATE", mode="NULLABLE"), + 
bigquery.SchemaField("web_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_open_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_close_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_class", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_mkt_desc", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_market_manager", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_company_id", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("web_company_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_street_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_suite_number", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_city", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_county", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_state", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_zip", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_country", "STRING", mode="NULLABLE"), + bigquery.SchemaField("web_gmt_offset", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("web_tax_percentage", "FLOAT", mode="NULLABLE"), + ], + "store_returns": [ + bigquery.SchemaField("sr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_return_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sr_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_store_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_ticket_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("sr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("sr_return_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_store_credit", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("sr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "household_demographics": [ + bigquery.SchemaField("hd_demo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("hd_income_band_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("hd_buy_potential", "STRING", mode="NULLABLE"), + bigquery.SchemaField("hd_dep_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("hd_vehicle_count", "INTEGER", mode="NULLABLE"), + ], + "web_page": [ + bigquery.SchemaField("wp_web_page_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("wp_web_page_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("wp_rec_start_date", "DATE", mode="NULLABLE"), + 
bigquery.SchemaField("wp_rec_end_date", "DATE", mode="NULLABLE"), + bigquery.SchemaField("wp_creation_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_access_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_autogen_flag", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_url", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_type", "STRING", mode="NULLABLE"), + bigquery.SchemaField("wp_char_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_link_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_image_count", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wp_max_ad_count", "INTEGER", mode="NULLABLE"), + ], + "promotion": [ + bigquery.SchemaField("p_promo_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("p_promo_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("p_start_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_end_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_item_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("p_response_target", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("p_promo_name", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_dmail", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_email", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_catalog", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_tv", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_radio", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_press", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_event", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_demo", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_channel_details", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_purpose", "STRING", mode="NULLABLE"), + bigquery.SchemaField("p_discount_active", "STRING", mode="NULLABLE"), + ], + "catalog_page": [ + bigquery.SchemaField("cp_catalog_page_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cp_catalog_page_id", "STRING", mode="REQUIRED"), + bigquery.SchemaField("cp_start_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_end_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_department", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cp_catalog_number", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_catalog_page_number", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cp_description", "STRING", mode="NULLABLE"), + bigquery.SchemaField("cp_type", "STRING", mode="NULLABLE"), + ], + "inventory": [ + bigquery.SchemaField("inv_date_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_warehouse_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("inv_quantity_on_hand", "INTEGER", mode="NULLABLE"), + ], + "catalog_returns": [ + bigquery.SchemaField("cr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returned_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cr_refunded_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_hdemo_sk", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("cr_refunded_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField( + "cr_returning_customer_sk", "INTEGER", mode="NULLABLE" + ), + bigquery.SchemaField("cr_returning_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returning_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_returning_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_call_center_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_catalog_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cr_return_amount", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_store_credit", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "web_returns": [ + bigquery.SchemaField("wr_returned_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returned_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("wr_refunded_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField( + "wr_returning_customer_sk", "INTEGER", mode="NULLABLE" + ), + bigquery.SchemaField("wr_returning_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returning_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_returning_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_web_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_reason_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("wr_return_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("wr_return_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_amt_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_fee", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_return_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_refunded_cash", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_reversed_charge", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_account_credit", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("wr_net_loss", "FLOAT", mode="NULLABLE"), + ], + "web_sales": [ + bigquery.SchemaField("ws_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_item_sk", 
"INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ws_bill_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_bill_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_bill_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_bill_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_web_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_web_site_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ws_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ws_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_coupon_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_ext_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_ship", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_paid_inc_ship_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ws_net_profit", "FLOAT", mode="NULLABLE"), + ], + "catalog_sales": [ + bigquery.SchemaField("cs_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_bill_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_call_center_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_catalog_page_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_ship_mode_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_warehouse_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cs_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("cs_order_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("cs_quantity", "INTEGER", mode="NULLABLE"), + 
bigquery.SchemaField("cs_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_coupon_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_ext_ship_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_ship", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_paid_inc_ship_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("cs_net_profit", "FLOAT", mode="NULLABLE"), + ], + "store_sales": [ + bigquery.SchemaField("ss_sold_date_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_sold_time_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_item_sk", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ss_customer_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_cdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_hdemo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_addr_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_store_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_promo_sk", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_ticket_number", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("ss_quantity", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ss_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_discount_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_sales_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_wholesale_cost", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_list_price", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_ext_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_coupon_amt", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_net_paid", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_net_paid_inc_tax", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("ss_net_profit", "FLOAT", mode="NULLABLE"), + ], + } + + return schema[table_name] + + +def load_data_to_bigquery(table_name, file_paths, client, dataset_ref, temp_file): + """Loads data from a list of files into a BigQuery table.""" + job_config = bigquery.LoadJobConfig( + source_format=bigquery.SourceFormat.CSV, + skip_leading_rows=0, # No header in .dat files + field_delimiter="|", + schema=get_schema(table_name), + ) + + table_ref = dataset_ref.table(table_name) + table = bigquery.Table(table_ref) + client.create_table(table) + + # Load data from each file + for file_path in sorted(file_paths): + preprocess_csv(file_path, temp_file) + with open(temp_file, "rb") as source_file: + job = client.load_table_from_file( + source_file, table_ref, job_config=job_config + ) + job.result() + print( + f"Loaded data from {file_path} into table {project_id}:{dataset_id}.{table_name}" + ) + + +if __name__ == "__main__": + """ + Loads 
TPC-DS data to BigQuery. + + This script loads TPC-DS data generated with source code from + https://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp + into BigQuery. + + Note: If the dataset already exists, the script will exit without uploading data. + + Usage: + python tpcds_upload_helper.py --project_id --dataset_id --ds_path + python tpcds_upload_helper.py -d -p -s + """ + parser = argparse.ArgumentParser(description="Load TPC-DS data to BigQuery") + parser.add_argument( + "--project_id", "-p", required=True, help="Google Cloud project ID" + ) + parser.add_argument("--dataset_id", "-d", required=True, help="BigQuery dataset ID") + parser.add_argument( + "--ds_path", "-s", required=True, help="Path to the TPC-DS data directory" + ) + args = parser.parse_args() + + project_id = args.project_id + dataset_id = args.dataset_id + ds_path = args.ds_path + temp_file = "temp.csv" + + # Initialize BigQuery client + client = bigquery.Client(project=project_id) + dataset_ref = client.dataset(dataset_id) + try: + # Quit if dataset exists + client.get_dataset(dataset_ref) + print(f"Dataset {project_id}:{dataset_id} already exists. Skipping.") + sys.exit(1) + except google.api_core.exceptions.NotFound: + # Create the dataset if it doesn't exist + dataset = bigquery.Dataset(dataset_ref) + client.create_dataset(dataset) + print(f"Created dataset {project_id}:{dataset_id}") + + # Iterate through the folders + for table_name in sorted(os.listdir(ds_path)): + table_path = os.path.join(ds_path, table_name) + table_name = table_name.split(".")[0] + if os.path.isdir(table_path): + file_paths = [ + os.path.join(table_path, f) + for f in os.listdir(table_path) + if f.endswith(".dat") + ] + load_data_to_bigquery( + table_name, file_paths, client, dataset_ref, temp_file + ) + + try: + os.remove(temp_file) + print("Removed temporary file: temp.csv") + except FileNotFoundError: + print("Temporary file not found.") From c188f49d8178a0fefbb2a8dddbb5d5fe02b827e9 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 6 Feb 2025 10:02:52 -0800 Subject: [PATCH 07/22] chore: add experimental Multimodal support in Gemini (#1368) * chore: add experimental Multimodal support in Gemini * fix * warning --- bigframes/dataframe.py | 4 +-- bigframes/ml/llm.py | 57 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 6308dcc8da..20f636b681 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -84,7 +84,7 @@ import bigframes.session - SingleItemValue = Union[bigframes.series.Series, int, float, Callable] + SingleItemValue = Union[bigframes.series.Series, int, float, str, Callable] LevelType = typing.Hashable LevelsType = typing.Union[LevelType, typing.Sequence[LevelType]] @@ -1953,7 +1953,7 @@ def _assign_single_item_listlike(self, k: str, v: Sequence) -> DataFrame: result_block = result_block.drop_columns([src_col]) return DataFrame(result_block) - def _assign_scalar(self, label: str, value: Union[int, float]) -> DataFrame: + def _assign_scalar(self, label: str, value: Union[int, float, str]) -> DataFrame: col_ids = self._block.cols_matching_label(label) block, constant_col_id = self._block.create_constant(value, label) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index d2e97a7608..7b66191a11 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,17 +16,19 @@ from __future__ import annotations -from typing import Callable, 
cast, Literal, Mapping, Optional +from typing import Callable, cast, Iterable, Literal, Mapping, Optional, Union import warnings import bigframes_vendored.constants as constants from google.cloud import bigquery import typing_extensions -from bigframes import clients, exceptions +from bigframes import clients, dtypes, exceptions +import bigframes.bigquery as bbq from bigframes.core import blocks, global_session, log_adapter import bigframes.dataframe from bigframes.ml import base, core, globals, utils +import bigframes.series _BQML_PARAMS_MAPPING = { "max_iterations": "maxIterations", @@ -83,6 +85,13 @@ _GEMINI_1P5_PRO_002_ENDPOINT, _GEMINI_1P5_FLASH_002_ENDPOINT, ) +_GEMINI_MULTIMODAL_ENDPOINTS = ( + _GEMINI_1P5_PRO_001_ENDPOINT, + _GEMINI_1P5_PRO_002_ENDPOINT, + _GEMINI_1P5_FLASH_001_ENDPOINT, + _GEMINI_1P5_FLASH_002_ENDPOINT, + _GEMINI_2_FLASH_EXP_ENDPOINT, +) _CLAUDE_3_SONNET_ENDPOINT = "claude-3-sonnet" _CLAUDE_3_HAIKU_ENDPOINT = "claude-3-haiku" @@ -925,12 +934,13 @@ def predict( top_p: float = 1.0, ground_with_google_search: bool = False, max_retries: int = 0, + prompt: Optional[Iterable[Union[str, bigframes.series.Series]]] = None, ) -> bigframes.dataframe.DataFrame: """Predict the result from input DataFrame. Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): - Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "prompt" column for prediction. + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, the "prompt" column, or created by "prompt" parameter, is used for prediction. Prompts can include preamble, questions, suggestions, instructions, or examples. temperature (float, default 0.9): @@ -966,6 +976,14 @@ def predict( max_retries (int, default 0): Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + prompt (Iterable of str or bigframes.series.Series, or None, default None): + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Construct a prompt struct column for prediction based on the input. The input must be an Iterable that can take string literals, + such as "summarize", string column(s) of X, such as X["str_col"], or blob column(s) of X, such as X["blob_col"]. + It creates a struct column of the items of the iterable, and use the concatenated result as the input prompt. No-op if set to None. Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. """ @@ -990,7 +1008,38 @@ def predict( f"max_retries must be larger than or equal to 0, but is {max_retries}." ) - (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + session = self._bqml_model.session + (X,) = utils.batch_convert_to_dataframe(X, session=session) + + if prompt: + if not bigframes.options.experiments.blob: + raise NotImplementedError() + + if self.model_name not in _GEMINI_MULTIMODAL_ENDPOINTS: + raise NotImplementedError( + f"GeminiTextGenerator only supports model_name {', '.join(_GEMINI_MULTIMODAL_ENDPOINTS)} for Multimodal prompt." 
+ ) + + df_prompt = X[[X.columns[0]]].rename( + columns={X.columns[0]: "bigframes_placeholder_col"} + ) + for i, item in enumerate(prompt): + # must be distinct str column labels to construct a struct + if isinstance(item, str): + label = f"input_{i}" + else: # Series + label = f"input_{i}_{item.name}" + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if ( + isinstance(item, bigframes.series.Series) + and item.dtype == dtypes.OBJ_REF_DTYPE + ): + item = item.blob._get_runtime("R", with_metadata=True) + + df_prompt[label] = item + df_prompt = df_prompt.drop(columns="bigframes_placeholder_col") + X["prompt"] = bbq.struct(df_prompt) if len(X.columns) == 1: # BQML identified the column by name From 19700316ce292b77f413b19889664a015b31d60c Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 6 Feb 2025 11:26:58 -0800 Subject: [PATCH 08/22] chore: Add __init__.py to functions test modules (#1371) --- tests/system/large/functions/__init__.py | 13 +++++++++++++ tests/system/small/functions/__init__.py | 13 +++++++++++++ tests/unit/functions/__init__.py | 13 +++++++++++++ 3 files changed, 39 insertions(+) create mode 100644 tests/system/large/functions/__init__.py create mode 100644 tests/system/small/functions/__init__.py create mode 100644 tests/unit/functions/__init__.py diff --git a/tests/system/large/functions/__init__.py b/tests/system/large/functions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/large/functions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/system/small/functions/__init__.py b/tests/system/small/functions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/system/small/functions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/unit/functions/__init__.py b/tests/unit/functions/__init__.py new file mode 100644 index 0000000000..0a2669d7a2 --- /dev/null +++ b/tests/unit/functions/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From 3c3e14c715f476ca44f254c0d53d639ea5988a8d Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Thu, 6 Feb 2025 13:47:59 -0600 Subject: [PATCH 09/22] feat: add `GeoSeries.from_xy()` (#1364) * feat: add GeoSeries.from_xy * add from_xy test and update ibis types * update geoseries notebook with from_xy * Update docstring example * fix doctstring lint error * return GeometryDtype() for all ibis geo types * chore: support timestamp subtractions (#1346) * chore: support timestamp subtractions * Fix format * use tree rewrites to dispatch timestamp_diff operator * add TODO for more node updates * polish the code and fix typos * fix comment * add rewrites to compile_raw and compile_peek_sql * chore: add a tool to upload tpcds data to bigquery. (#1367) * chore: add a tool to upload tpcds data to bigquery. * update error type * update docstring --------- Co-authored-by: Shenyang Cai Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- bigframes/core/compile/ibis_types.py | 15 ++- bigframes/core/compile/scalar_op_compiler.py | 7 + bigframes/geopandas/geoseries.py | 9 ++ bigframes/operations/__init__.py | 8 +- bigframes/operations/geo_ops.py | 4 + bigframes/operations/type.py | 14 ++ notebooks/geo/geoseries.ipynb | 125 ++++++++++++------ .../system/small/geopandas/test_geoseries.py | 20 +++ .../bigframes_vendored/geopandas/geoseries.py | 42 ++++++ 9 files changed, 199 insertions(+), 45 deletions(-) diff --git a/bigframes/core/compile/ibis_types.py b/bigframes/core/compile/ibis_types.py index 78c2259cf0..af2b7908ad 100644 --- a/bigframes/core/compile/ibis_types.py +++ b/bigframes/core/compile/ibis_types.py @@ -47,6 +47,8 @@ ibis_dtypes.JSON, ] +IBIS_GEO_TYPE = ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True) + BIDIRECTIONAL_MAPPINGS: Iterable[Tuple[IbisDtype, bigframes.dtypes.Dtype]] = ( (ibis_dtypes.boolean, pd.BooleanDtype()), @@ -70,7 +72,7 @@ pd.ArrowDtype(pa.decimal256(76, 38)), ), ( - ibis_dtypes.GeoSpatial(geotype="geography", srid=4326, nullable=True), + IBIS_GEO_TYPE, gpd.array.GeometryDtype(), ), (ibis_dtypes.json, db_dtypes.JSONDtype()), @@ -177,6 +179,14 @@ def cast_ibis_value( ibis_dtypes.timestamp, ), ibis_dtypes.binary: (ibis_dtypes.string,), + ibis_dtypes.point: (IBIS_GEO_TYPE,), + ibis_dtypes.geometry: (IBIS_GEO_TYPE,), + ibis_dtypes.geography: (IBIS_GEO_TYPE,), + ibis_dtypes.linestring: (IBIS_GEO_TYPE,), + ibis_dtypes.polygon: (IBIS_GEO_TYPE,), + ibis_dtypes.multilinestring: (IBIS_GEO_TYPE,), + ibis_dtypes.multipoint: (IBIS_GEO_TYPE,), + ibis_dtypes.multipolygon: (IBIS_GEO_TYPE,), } value = ibis_value_to_canonical_type(value) @@ -282,6 +292,9 @@ def ibis_dtype_to_bigframes_dtype( if isinstance(ibis_dtype, ibis_dtypes.JSON): return bigframes.dtypes.JSON_DTYPE + if isinstance(ibis_dtype, ibis_dtypes.GeoSpatial): + return gpd.array.GeometryDtype() + if ibis_dtype in IBIS_TO_BIGFRAMES: return IBIS_TO_BIGFRAMES[ibis_dtype] elif isinstance(ibis_dtype, ibis_dtypes.Decimal): diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 
4739cc9a99..ea642c20fd 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1003,6 +1003,13 @@ def geo_area_op_impl(x: ibis_types.Value): return typing.cast(ibis_types.GeoSpatialValue, x).area() +@scalar_op_compiler.register_binary_op(ops.geo_st_geogpoint_op, pass_op=False) +def geo_st_geogpoint_op_impl(x: ibis_types.Value, y: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).point( + typing.cast(ibis_types.NumericValue, y) + ) + + # Parameterized ops @scalar_op_compiler.register_unary_op(ops.StructFieldOp, pass_op=True) def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp): diff --git a/bigframes/geopandas/geoseries.py b/bigframes/geopandas/geoseries.py index bc0482f60d..b757e2b971 100644 --- a/bigframes/geopandas/geoseries.py +++ b/bigframes/geopandas/geoseries.py @@ -66,3 +66,12 @@ def area(self, crs=None) -> bigframes.series.Series: # type: ignore raise NotImplementedError( f"GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. {constants.FEEDBACK_LINK}" ) + + @classmethod + def from_xy(cls, x, y, index=None, session=None, **kwargs) -> GeoSeries: + # TODO: if either x or y is local and the other is remote. Use the + # session from the remote object. + series_x = bigframes.series.Series(x, index=index, session=session, **kwargs) + series_y = bigframes.series.Series(y, index=index, session=session, **kwargs) + + return cls(series_x._apply_binary_op(series_y, ops.geo_st_geogpoint_op)) diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index ba8f3f64d7..d35fa2c5c2 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -85,7 +85,12 @@ SqlScalarOp, where_op, ) -from bigframes.operations.geo_ops import geo_area_op, geo_x_op, geo_y_op +from bigframes.operations.geo_ops import ( + geo_area_op, + geo_st_geogpoint_op, + geo_x_op, + geo_y_op, +) from bigframes.operations.json_ops import ( JSONExtract, JSONExtractArray, @@ -337,6 +342,7 @@ "geo_x_op", "geo_y_op", "geo_area_op", + "geo_st_geogpoint_op", # Numpy ops mapping "NUMPY_TO_BINOP", "NUMPY_TO_OP", diff --git a/bigframes/operations/geo_ops.py b/bigframes/operations/geo_ops.py index bc14fa611b..0ae8accd56 100644 --- a/bigframes/operations/geo_ops.py +++ b/bigframes/operations/geo_ops.py @@ -36,3 +36,7 @@ dtypes.is_geo_like, dtypes.FLOAT_DTYPE, description="geo-like" ), ) + +geo_st_geogpoint_op = base_ops.create_binary_op( + name="geo_st_geogpoint", type_signature=op_typing.BinaryNumericGeo() +) diff --git a/bigframes/operations/type.py b/bigframes/operations/type.py index 441134aff5..86bb56fc39 100644 --- a/bigframes/operations/type.py +++ b/bigframes/operations/type.py @@ -121,6 +121,20 @@ def output_type( return bigframes.dtypes.coerce_to_common(left_type, right_type) +@dataclasses.dataclass +class BinaryNumericGeo(BinaryTypeSignature): + """Type signature for geo functions like from_xy that can map ints to ints.""" + + def output_type( + self, left_type: ExpressionType, right_type: ExpressionType + ) -> ExpressionType: + if (left_type is not None) and not bigframes.dtypes.is_numeric(left_type): + raise TypeError(f"Type {left_type} is not numeric") + if (right_type is not None) and not bigframes.dtypes.is_numeric(right_type): + raise TypeError(f"Type {right_type} is not numeric") + return bigframes.dtypes.GEO_DTYPE + + @dataclasses.dataclass class BinaryRealNumeric(BinaryTypeSignature): """Type signature for real-valued functions like divide, arctan2, pow.""" 
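Taken together, the pieces above — `geo_st_geogpoint_op`, its scalar compiler rule, and the `BinaryNumericGeo` type signature — give `GeoSeries.from_xy()` an end-to-end path from numeric x/y inputs to GEOGRAPHY points (via ibis' `.point()`, which presumably maps to BigQuery's `ST_GEOGPOINT`). A minimal usage sketch, with the illustrative coordinates from the docstring example:

    import bigframes.geopandas
    import bigframes.pandas as bpd

    bpd.options.display.progress_bar = None

    # x carries longitudes, y carries latitudes.
    x = [2.5, 5, -3.0]
    y = [0.5, 1, 1.5]

    points = bigframes.geopandas.GeoSeries.from_xy(x, y)
    points    # geometry dtype: POINT (2.5 0.5), POINT (5 1), POINT (-3 1.5)

    # The existing x/y accessors should round-trip the coordinates as Float64.
    points.x
    points.y

The notebook changes below exercise the same round trip against the Census counties table, feeding `geo_points.x` and `geo_points.y` back into `from_xy()`.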
diff --git a/notebooks/geo/geoseries.ipynb b/notebooks/geo/geoseries.ipynb index 7dc4c596ca..4792c4fe27 100644 --- a/notebooks/geo/geoseries.ipynb +++ b/notebooks/geo/geoseries.ipynb @@ -44,7 +44,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Load the Counties table from the Census Bureau US Boundaries dataset" + "### 1. Load the Counties table from the Census Bureau US Boundaries dataset" ] }, { @@ -56,7 +56,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/arwas/src1/python-bigquery-dataframes/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", + "/usr/local/google/home/arwas/src/bigframes3/bigframes/session/_io/bigquery/read_gbq_table.py:280: DefaultIndexWarning: Table 'bigquery-public-data.geo_us_boundaries.counties' is clustered and/or partitioned, but BigQuery DataFrames was not able to find a suitable index. To avoid this warning, set at least one of: `index_col` or `filters`.\n", " warnings.warn(msg, category=bfe.DefaultIndexWarning)\n" ] } @@ -69,7 +69,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Create a series from the int_point_geom column" + "### 2. Create a series from the int_point_geom column" ] }, { @@ -103,11 +103,11 @@ { "data": { "text/plain": [ - "37 POINT (-91.19496 39.98605)\n", - "406 POINT (-84.86717 33.92103)\n", - "926 POINT (-82.47974 35.33641)\n", - "940 POINT (-75.50298 39.09709)\n", - "996 POINT (-92.56434 39.8298)\n", + "171 POINT (-95.50742 42.39186)\n", + "219 POINT (-105.42894 37.27755)\n", + "402 POINT (-93.34905 32.10121)\n", + "526 POINT (-84.60469 43.29233)\n", + "677 POINT (-89.5681 37.04779)\n", "Name: int_point_geom, dtype: geometry" ] }, @@ -136,11 +136,11 @@ { "data": { "text/plain": [ - "0 POINT (-91.19496 39.98605)\n", - "1 POINT (-84.86717 33.92103)\n", - "2 POINT (-82.47974 35.33641)\n", - "3 POINT (-75.50298 39.09709)\n", - "4 POINT (-92.56434 39.8298)\n", + "0 POINT (-95.50742 42.39186)\n", + "1 POINT (-105.42894 37.27755)\n", + "2 POINT (-93.34905 32.10121)\n", + "3 POINT (-84.60469 43.29233)\n", + "4 POINT (-89.5681 37.04779)\n", "dtype: geometry" ] }, @@ -185,11 +185,11 @@ { "data": { "text/plain": [ - "0 -91.194961\n", - "1 -84.867169\n", - "2 -82.479741\n", - "3 -75.502982\n", - "4 -92.56434\n", + "0 -95.507421\n", + "1 -105.42894\n", + "2 -93.34905\n", + "3 -84.60469\n", + "4 -89.568097\n", "dtype: Float64" ] }, @@ -217,11 +217,11 @@ { "data": { "text/plain": [ - "0 39.986053\n", - "1 33.92103\n", - "2 35.336415\n", - "3 39.097088\n", - "4 39.829795\n", + "0 42.39186\n", + "1 37.277547\n", + "2 32.101213\n", + "3 43.292326\n", + "4 37.047793\n", "dtype: Float64" ] }, @@ -367,11 +367,11 @@ { "data": { "text/plain": [ - "10 POLYGON ((-101.7778 40.34969, -101.77812 40.34...\n", - "127 POLYGON ((-89.22333 44.50398, -89.22334 44.499...\n", - "253 POLYGON ((-76.69446 37.07288, -76.69515 37.072...\n", - "261 POLYGON ((-98.70136 44.45055, -98.70136 44.450...\n", - "303 POLYGON ((-85.99565 30.28131, -85.99566 30.280...\n", + "54 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", + "256 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", + "266 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", + "485 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", + "765 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", "Name: 
county_geom, dtype: geometry" ] }, @@ -389,7 +389,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Convert the geometry collection to `bigframes.gopandas.GeoSeries`" + "### 2. Convert the geometry collection to `bigframes.gopandas.GeoSeries`" ] }, { @@ -400,11 +400,11 @@ { "data": { "text/plain": [ - "0 POLYGON ((-101.7778 40.34969, -101.77812 40.34...\n", - "1 POLYGON ((-89.22333 44.50398, -89.22334 44.499...\n", - "2 POLYGON ((-76.69446 37.07288, -76.69515 37.072...\n", - "3 POLYGON ((-98.70136 44.45055, -98.70136 44.450...\n", - "4 POLYGON ((-85.99565 30.28131, -85.99566 30.280...\n", + "0 POLYGON ((-93.76575 45.06448, -93.76575 45.064...\n", + "1 POLYGON ((-89.83723 42.68318, -89.83732 42.682...\n", + "2 POLYGON ((-104.19381 39.56523, -104.19464 39.5...\n", + "3 MULTIPOLYGON (((-91.05884 32.17233, -91.05891 ...\n", + "4 POLYGON ((-83.61848 38.1557, -83.61861 38.1554...\n", "dtype: geometry" ] }, @@ -442,14 +442,14 @@ "outputs": [ { "ename": "NotImplementedError", - "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.34.0", + "evalue": "GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNotImplementedError\u001b[0m Traceback (most recent call last)", "Cell \u001b[0;32mIn[13], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mfive_geom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea\u001b[49m\n", - "File \u001b[0;32m~/src1/python-bigquery-dataframes/bigframes/geopandas/geoseries.py:66\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 68\u001b[0m )\n", - "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.34.0" + "File \u001b[0;32m~/src/bigframes3/bigframes/geopandas/geoseries.py:66\u001b[0m, in \u001b[0;36mGeoSeries.area\u001b[0;34m(self, crs)\u001b[0m\n\u001b[1;32m 47\u001b[0m \u001b[38;5;129m@property\u001b[39m\n\u001b[1;32m 48\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21marea\u001b[39m(\u001b[38;5;28mself\u001b[39m, crs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m bigframes\u001b[38;5;241m.\u001b[39mseries\u001b[38;5;241m.\u001b[39mSeries: \u001b[38;5;66;03m# type: ignore\u001b[39;00m\n\u001b[1;32m 49\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Returns a Series containing the area of each geometry in the GeoSeries\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;124;03m expressed in the units of the CRS.\u001b[39;00m\n\u001b[1;32m 51\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[38;5;124;03m GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), insetead.\u001b[39;00m\n\u001b[1;32m 65\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m---> 66\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m(\n\u001b[1;32m 67\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mconstants\u001b[38;5;241m.\u001b[39mFEEDBACK_LINK\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 68\u001b[0m )\n", + "\u001b[0;31mNotImplementedError\u001b[0m: GeoSeries.area is not supported. Use bigframes.bigquery.st_area(series), instead. Share your usecase with the BigQuery DataFrames team at the https://bit.ly/bigframes-feedback survey.You are currently running BigFrames version 1.35.0" ] } ], @@ -461,7 +461,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" + "### 3. Use `bigframes.bigquery.st_area` to retirive the `area` in square meters instead. See: https://cloud.google.com/bigquery/docs/reference/standard-sql/geography_functions#st_area" ] }, { @@ -481,11 +481,11 @@ { "data": { "text/plain": [ - "0 2382382043.48891\n", - "1 1977633097.26862\n", - "2 939388839.499466\n", - "3 3269015229.381782\n", - "4 2678752241.321673\n", + "0 1567505274.453911\n", + "1 1511436852.079554\n", + "2 4789800692.948824\n", + "3 1686877416.586061\n", + "4 740944862.916908\n", "dtype: Float64" ] }, @@ -498,6 +498,45 @@ "geom_area = bbq.st_area(five_geom)\n", "geom_area" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Use `bigframes.geopandas.GeoSeries.from_xy()` to create a GeoSeries of `Point` geometries. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. 
Reuse the `geo_points.x` and `geo_points.y` results by passing them to `.from_xy()` " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 POINT (-95.50742 42.39186)\n", + "1 POINT (-105.42894 37.27755)\n", + "2 POINT (-93.34905 32.10121)\n", + "3 POINT (-84.60469 43.29233)\n", + "4 POINT (-89.5681 37.04779)\n", + "dtype: geometry" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bigframes.geopandas.GeoSeries.from_xy(geo_points.x, geo_points.y)" + ] } ], "metadata": { diff --git a/tests/system/small/geopandas/test_geoseries.py b/tests/system/small/geopandas/test_geoseries.py index 2967e4d247..5951d0b12c 100644 --- a/tests/system/small/geopandas/test_geoseries.py +++ b/tests/system/small/geopandas/test_geoseries.py @@ -87,3 +87,23 @@ def test_geo_area_not_supported(): ), ): bf_series.area + + +def test_geo_from_xy(): + x = [2.5, 5, -3.0] + y = [0.5, 1, 1.5] + bf_result = ( + bigframes.geopandas.GeoSeries.from_xy(x, y) + .astype(geopandas.array.GeometryDtype()) + .to_pandas() + ) + pd_result = geopandas.GeoSeries.from_xy(x, y, crs="EPSG:4326").astype( + geopandas.array.GeometryDtype() + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + check_series_type=False, + check_index=False, + ) diff --git a/third_party/bigframes_vendored/geopandas/geoseries.py b/third_party/bigframes_vendored/geopandas/geoseries.py index d84dec94a8..b8a7af437b 100644 --- a/third_party/bigframes_vendored/geopandas/geoseries.py +++ b/third_party/bigframes_vendored/geopandas/geoseries.py @@ -90,3 +90,45 @@ def y(self) -> bigframes.series.Series: Return the y location (latitude) of point geometries. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @classmethod + def from_xy(cls, x, y, index=None, **kwargs) -> bigframes.geopandas.GeoSeries: + """ + Alternate constructor to create a GeoSeries of Point geometries from + lists or arrays of x, y coordinates. + + In case of geographic coordinates, it is assumed that longitude is + captured by x coordinates and latitude by y. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.geopandas + >>> bpd.options.display.progress_bar = None + + >>> x = [2.5, 5, -3.0] + >>> y = [0.5, 1, 1.5] + + >>> s = bigframes.geopandas.GeoSeries.from_xy(x, y) + >>> s + 0 POINT (2.5 0.5) + 1 POINT (5 1) + 2 POINT (-3 1.5) + dtype: geometry + + Args: + x, y (array-like): + longitude is x coordinates and latitude y coordinates. + + index (array-like or Index, optional): + The index for the GeoSeries. If not given and all coordinate + inputs are Series with an equal index, that index is used.. + + **kwargs: + Additional arguments passed to the Series constructor, e.g. `name`. + + Returns: + bigframes.geopandas.GeoSeries: + A GeoSeries of Point geometries. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 65933b6b7608ec52717e818d8ec1732fb756b67b Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 11:59:21 -0800 Subject: [PATCH 10/22] chore: support casting floats and list-likes to timedelta series (#1362) * chore: support casting floats and list-likes to timedelta series * use singledispatch to handle different input types * fix format * stop using singledispatch due to compatibility issues --- bigframes/operations/timedelta_ops.py | 6 ++-- bigframes/pandas/core/tools/timedeltas.py | 20 +++++++----- tests/system/small/test_pandas.py | 38 ++++++++++++++++++++++- 3 files changed, 53 insertions(+), 11 deletions(-) diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index e212381557..f5b82c2331 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -26,6 +26,6 @@ class ToTimedeltaOp(base_ops.UnaryOp): unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] def output_type(self, *input_types): - if input_types[0] is not dtypes.INT_DTYPE: - raise TypeError("expected integer input") - return dtypes.TIMEDELTA_DTYPE + if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): + return dtypes.TIMEDELTA_DTYPE + raise TypeError("expected integer or float input") diff --git a/bigframes/pandas/core/tools/timedeltas.py b/bigframes/pandas/core/tools/timedeltas.py index 0cedf425fe..070a41d62d 100644 --- a/bigframes/pandas/core/tools/timedeltas.py +++ b/bigframes/pandas/core/tools/timedeltas.py @@ -18,20 +18,26 @@ timedeltas as vendored_pandas_timedeltas, ) import pandas as pd +import pandas.api.types as pdtypes from bigframes import operations as ops -from bigframes import series +from bigframes import series, session def to_timedelta( - arg: typing.Union[series.Series, str, int, float], + arg, unit: typing.Optional[vendored_pandas_timedeltas.UnitChoices] = None, -) -> typing.Union[series.Series, pd.Timedelta]: - if not isinstance(arg, series.Series): - return pd.to_timedelta(arg, unit) + *, + session: typing.Optional[session.Session] = None, +): + if isinstance(arg, series.Series): + canonical_unit = "us" if unit is None else _canonicalize_unit(unit) + return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) - canonical_unit = "us" if unit is None else _canonicalize_unit(unit) - return arg._apply_unary_op(ops.ToTimedeltaOp(canonical_unit)) + if pdtypes.is_list_like(arg): + return to_timedelta(series.Series(arg), unit, session=session) + + return pd.to_timedelta(arg, unit) to_timedelta.__doc__ = vendored_pandas_timedeltas.to_timedelta.__doc__ diff --git a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index e46d073056..4b4264e33c 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -763,7 +763,7 @@ def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc): "micros", ], ) -def test_to_timedelta_with_bf_series(session, unit): +def test_to_timedelta_with_bf_integer_series(session, unit): bf_series = bpd.Series([1, 2, 3], session=session) pd_series = pd.Series([1, 2, 3]) @@ -779,6 +779,42 @@ def test_to_timedelta_with_bf_series(session, unit): ) +def test_to_timedelta_with_bf_float_series_value_rounded_down(session): + bf_series = bpd.Series([1.2, 2.9], session=session) + + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(bf_series, "us")) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series([pd.Timedelta(1, "us"), 
pd.Timedelta(2, "us")]) + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + +@pytest.mark.parametrize( + "input", + [ + pytest.param([1, 2, 3], id="list"), + pytest.param((1, 2, 3), id="tuple"), + pytest.param(pd.Series([1, 2, 3]), id="pandas-series"), + ], +) +def test_to_timedelta_with_list_like_input(session, input): + actual_result = ( + typing.cast(bpd.Series, bpd.to_timedelta(input, "s", session=session)) + .to_pandas() + .astype("timedelta64[ns]") + ) + + expected_result = pd.Series(pd.to_timedelta(input, "s")) + pd.testing.assert_series_equal( + actual_result, expected_result, check_index_type=False + ) + + @pytest.mark.parametrize( "unit", ["Y", "M", "whatever"], From 269f4b49394d99dac1c9a1698bcef899851375b0 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Thu, 6 Feb 2025 13:04:48 -0800 Subject: [PATCH 11/22] chore: adjust StreamingDataFrame impl for the upcoming required APPENDS clause (#1366) --- bigframes/streaming/dataframe.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/bigframes/streaming/dataframe.py b/bigframes/streaming/dataframe.py index 90c638b82e..2180a66207 100644 --- a/bigframes/streaming/dataframe.py +++ b/bigframes/streaming/dataframe.py @@ -24,7 +24,7 @@ from google.cloud import bigquery from bigframes import dataframe -from bigframes.core import log_adapter +from bigframes.core import log_adapter, nodes import bigframes.exceptions as bfe import bigframes.session @@ -54,7 +54,7 @@ def _curate_df_doc(doc: Optional[str]): class StreamingBase: - sql: str + _appends_sql: str _session: bigframes.session.Session def to_bigtable( @@ -124,7 +124,7 @@ def to_bigtable( can be examined. """ return _to_bigtable( - self.sql, + self._appends_sql, instance=instance, table=table, service_account_email=service_account_email, @@ -181,7 +181,7 @@ def to_pubsub( can be examined. 
""" return _to_pubsub( - self.sql, + self._appends_sql, topic=topic, service_account_email=service_account_email, session=self._session, @@ -218,6 +218,19 @@ def __init__(self, df: dataframe.DataFrame, *, create_key=0): def _from_table_df(cls, df: dataframe.DataFrame) -> StreamingDataFrame: return cls(df, create_key=cls._create_key) + @property + def _original_table(self): + def traverse(node: nodes.BigFrameNode): + if isinstance(node, nodes.ReadTableNode): + return f"{node.source.table.project_id}.{node.source.table.dataset_id}.{node.source.table.table_id}" + for child in node.child_nodes: + original_table = traverse(child) + if original_table: + return original_table + return None + + return traverse(self._df._block._expr.node) + def __getitem__(self, *args, **kwargs): return _return_type_wrapper(self._df.__getitem__, StreamingDataFrame)( *args, **kwargs @@ -266,6 +279,17 @@ def sql(self): sql.__doc__ = _curate_df_doc(inspect.getdoc(dataframe.DataFrame.sql)) + # Patch for the required APPENDS clause + @property + def _appends_sql(self): + sql_str = self.sql + original_table = self._original_table + assert original_table is not None + + appends_clause = f"APPENDS(TABLE `{original_table}`, NULL, NULL)" + sql_str = sql_str.replace(f"`{original_table}`", appends_clause) + return sql_str + @property def _session(self): return self._df._session From 0c55b07dc001b568875f06d578ca7d59409f2a11 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Thu, 6 Feb 2025 14:16:27 -0800 Subject: [PATCH 12/22] fix: translate labels to col ids when copying dataframes (#1372) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: translate labels to col ids when copying dataframes * polish error message * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * polish doc --------- Co-authored-by: Owl Bot --- bigframes/core/blocks.py | 20 ++++++++++++++++++++ bigframes/dataframe.py | 15 +++++---------- tests/system/small/test_dataframe.py | 11 +++++++++-- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index b1f4ed35cc..c6e3096e51 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -276,6 +276,26 @@ def label_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: mapping[label] = (*mapping.get(label, ()), id) return mapping + def resolve_label_exact(self, label: Label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, returns None.""" + matches = self.label_to_col_id.get(label, []) + if len(matches) > 1: + raise ValueError( + f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" + ) + return matches[0] if len(matches) != 0 else None + + def resolve_label_exact_or_error(self, label: Label) -> str: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such a column, raises an error too.""" + col_id = self.resolve_label_exact(label) + if col_id is None: + raise ValueError(f"Label {label} not found. 
{constants.FEEDBACK_LINK}") + return col_id + @functools.cached_property def col_id_to_index_name(self) -> typing.Mapping[str, Label]: """Get column label for value columns, or index name for index columns""" diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 20f636b681..4ffa56c2e5 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -180,7 +180,10 @@ def __init__( ) block = block.set_index([r_mapping[idx_col] for idx_col in idx_cols]) if columns: - block = block.select_columns(list(columns)) # type:ignore + column_ids = [ + block.resolve_label_exact_or_error(label) for label in list(columns) + ] + block = block.select_columns(column_ids) # type:ignore if dtype: bf_dtype = bigframes.dtypes.bigframes_type(dtype) block = block.multi_apply_unary_op(ops.AsTypeOp(to_type=bf_dtype)) @@ -238,15 +241,7 @@ def _find_indices( return [self._block.value_columns.index(col_id) for col_id in col_ids] def _resolve_label_exact(self, label) -> Optional[str]: - """Returns the column id matching the label if there is exactly - one such column. If there are multiple columns with the same name, - raises an error. If there is no such column, returns None.""" - matches = self._block.label_to_col_id.get(label, []) - if len(matches) > 1: - raise ValueError( - f"Multiple columns matching id {label} were found. {constants.FEEDBACK_LINK}" - ) - return matches[0] if len(matches) != 0 else None + return self._block.resolve_label_exact(label) def _sql_names( self, diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index e7556043af..1db89a074a 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -44,8 +44,15 @@ def test_df_construct_copy(scalars_dfs): columns = ["int64_col", "string_col", "float64_col"] scalars_df, scalars_pandas_df = scalars_dfs - bf_result = dataframe.DataFrame(scalars_df, columns=columns).to_pandas() - pd_result = pd.DataFrame(scalars_pandas_df, columns=columns) + # Make the mapping from label to col_id non-trivial + bf_df = scalars_df.copy() + bf_df["int64_col"] = bf_df["int64_col"] / 2 + pd_df = scalars_pandas_df.copy() + pd_df["int64_col"] = pd_df["int64_col"] / 2 + + bf_result = dataframe.DataFrame(bf_df, columns=columns).to_pandas() + + pd_result = pd.DataFrame(pd_df, columns=columns) pandas.testing.assert_frame_equal(bf_result, pd_result) From 923da037ef6e4e7f8b54924ea5644c2c5ceb2234 Mon Sep 17 00:00:00 2001 From: TrevorBergeron Date: Thu, 6 Feb 2025 15:33:57 -0800 Subject: [PATCH 13/22] perf: Prune unused operations from sql (#1365) --- bigframes/core/__init__.py | 24 +- bigframes/core/compile/compiled.py | 2 +- bigframes/core/compile/compiler.py | 22 +- bigframes/core/nodes.py | 423 +++++++++--------- bigframes/core/rewrite/__init__.py | 2 + bigframes/core/rewrite/implicit_align.py | 40 +- bigframes/core/rewrite/legacy_align.py | 6 +- bigframes/core/rewrite/order.py | 16 +- bigframes/core/rewrite/pruning.py | 195 ++++++++ bigframes/core/rewrite/slices.py | 4 +- .../ibis/backends/sql/rewrites.py | 2 +- .../bigframes_vendored/ibis/common/graph.py | 5 + 12 files changed, 489 insertions(+), 252 deletions(-) create mode 100644 bigframes/core/rewrite/pruning.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 5f64bf68dd..dc9b8e3b9b 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -304,18 +304,25 @@ def assign(self, source_id: str, destination_id: str) -> ArrayValue: if destination_id in self.column_ids: # Mutate case exprs = [ ( - 
ex.deref(source_id if (col_id == destination_id) else col_id), - ids.ColumnId(col_id), + bigframes.core.nodes.AliasedRef( + ex.deref(source_id if (col_id == destination_id) else col_id), + ids.ColumnId(col_id), + ) ) for col_id in self.column_ids ] else: # append case self_projection = ( - (ex.deref(col_id), ids.ColumnId(col_id)) for col_id in self.column_ids + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in self.column_ids ) exprs = [ *self_projection, - (ex.deref(source_id), ids.ColumnId(destination_id)), + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id), ids.ColumnId(destination_id) + ) + ), ] return ArrayValue( nodes.SelectionNode( @@ -337,7 +344,10 @@ def create_constant( def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ((ex.deref(col_id), ids.ColumnId(col_id)) for col_id in column_ids) + selections = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in column_ids + ) return ArrayValue( nodes.SelectionNode( child=self.node, @@ -488,7 +498,9 @@ def prepare_join_names( nodes.SelectionNode( other.node, tuple( - (ex.deref(old_id), ids.ColumnId(new_id)) + bigframes.core.nodes.AliasedRef( + ex.deref(old_id), ids.ColumnId(new_id) + ) for old_id, new_id in r_mapping.items() ), ), diff --git a/bigframes/core/compile/compiled.py b/bigframes/core/compile/compiled.py index 906bdb1f0d..93be998b5b 100644 --- a/bigframes/core/compile/compiled.py +++ b/bigframes/core/compile/compiled.py @@ -184,7 +184,7 @@ def _to_ibis_expr( # Special case for empty tables, since we can't create an empty # projection. if not self._columns: - return bigframes_vendored.ibis.memtable([]) + return self._table.select([bigframes_vendored.ibis.literal(1)]) table = self._table.select(self._columns) if fraction is not None: diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index dca204401e..ff5f1d61c8 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -62,9 +62,11 @@ def compile_sql( if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) + # TODO: Extract out CTEs node, ordering = rewrites.pull_up_order( node, order_root=True, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql( order_by=ordering.all_ordering_columns, @@ -76,6 +78,7 @@ def compile_sql( node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) ir = self.compile_node(node) return ir.to_sql(selections=output_ids) @@ -86,6 +89,7 @@ def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) + node = rewrites.column_pruning(node) return self.compile_node(node).to_sql(limit=n_rows, selections=ids) def compile_raw( @@ -97,6 +101,7 @@ def compile_raw( node = nodes.bottom_up(node, rewrites.rewrite_slice) node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) + node = rewrites.column_pruning(node) ir = self.compile_node(node) sql = ir.to_sql() return sql, node.schema.to_bigquery(), ordering @@ -192,31 +197,34 @@ def compile_readtable(self, node: nodes.ReadTableNode): return self.compile_read_table_unordered(node.source, 
node.scan_list) def read_table_as_unordered_ibis( - self, source: nodes.BigqueryDataSource + self, + source: nodes.BigqueryDataSource, + scan_cols: typing.Sequence[str], ) -> ibis_types.Table: full_table_name = f"{source.table.project_id}.{source.table.dataset_id}.{source.table.table_id}" - used_columns = tuple(col.name for col in source.table.physical_schema) # Physical schema might include unused columns, unsupported datatypes like JSON physical_schema = ibis_bigquery.BigQuerySchema.to_ibis( - list(i for i in source.table.physical_schema if i.name in used_columns) + list(source.table.physical_schema) ) if source.at_time is not None or source.sql_predicate is not None: import bigframes.session._io.bigquery sql = bigframes.session._io.bigquery.to_query( full_table_name, - columns=used_columns, + columns=scan_cols, sql_predicate=source.sql_predicate, time_travel_timestamp=source.at_time, ) return ibis_bigquery.Backend().sql(schema=physical_schema, query=sql) else: - return ibis_api.table(physical_schema, full_table_name) + return ibis_api.table(physical_schema, full_table_name).select(scan_cols) def compile_read_table_unordered( self, source: nodes.BigqueryDataSource, scan: nodes.ScanList ): - ibis_table = self.read_table_as_unordered_ibis(source) + ibis_table = self.read_table_as_unordered_ibis( + source, scan_cols=[col.source_id for col in scan.items] + ) return compiled.UnorderedIR( ibis_table, tuple( @@ -291,7 +299,7 @@ def set_output_names( return nodes.SelectionNode( node, tuple( - (ex.DerefOp(old_id), ids.ColumnId(out_id)) + bigframes.core.nodes.AliasedRef(ex.DerefOp(old_id), ids.ColumnId(out_id)) for old_id, out_id in zip(node.ids, output_ids) ), ) diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 085d52daa6..88e084d79c 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -20,7 +20,7 @@ import functools import itertools import typing -from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple +from typing import Callable, cast, Iterable, Mapping, Optional, Sequence, Tuple, TypeVar import google.cloud.bigquery as bq @@ -44,6 +44,8 @@ COLUMN_SET = frozenset[bfet_ids.ColumnId] +Self = TypeVar("Self") + @dataclasses.dataclass(frozen=True) class Field: @@ -87,10 +89,17 @@ def child_nodes(self) -> typing.Sequence[BigFrameNode]: def row_count(self) -> typing.Optional[int]: return None + @abc.abstractmethod + def remap_vars( + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: + """Remap defined (in this node only) variables.""" + ... + @abc.abstractmethod def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> Self: """Remap variable references""" ... @@ -100,6 +109,10 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: """The variables defined in this node (as opposed to by child nodes).""" ... + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @functools.cached_property def session(self): sessions = [] @@ -248,18 +261,11 @@ def planning_complexity(self) -> int: @abc.abstractmethod def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + self: Self, t: Callable[[BigFrameNode], BigFrameNode] + ) -> Self: """Apply a function to each child node.""" ... 
- @abc.abstractmethod - def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - """Remap defined (in this node only) variables.""" - ... - @property def defines_namespace(self) -> bool: """ @@ -269,16 +275,6 @@ def defines_namespace(self) -> bool: """ return False - @functools.cached_property - def defined_variables(self) -> set[str]: - """Full set of variables defined in the namespace, even if not selected.""" - self_defined_variables = set(self.schema.names) - if self.defines_namespace: - return self_defined_variables - return self_defined_variables.union( - *(child.defined_variables for child in self.child_nodes) - ) - def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: return self._dtype_lookup[id] @@ -286,9 +282,6 @@ def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: def _dtype_lookup(self): return {field.id: field.dtype for field in self.fields} - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self.transform_children(lambda x: x.prune(used_cols)) - class AdditiveNode: """Definition of additive - if you drop added_fields, you end up with the descendent. @@ -336,7 +329,7 @@ def explicitly_ordered(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> UnaryNode: transformed = dataclasses.replace(self, child=t(self.child)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory @@ -406,12 +399,18 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> SliceNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> SliceNode: return self @@ -483,6 +482,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.indicator_col,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset({self.left_col.id, self.right_col.id}) + @property def additive_base(self) -> BigFrameNode: return self.left_child @@ -490,9 +493,7 @@ def additive_base(self) -> BigFrameNode: def replace_additive_base(self, node: BigFrameNode): return dataclasses.replace(self, left_child=node) - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> InNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -501,17 +502,16 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> InNode: return dataclasses.replace( self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> InNode: return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, 
allow_partial_bindings=True)) # type: ignore @@ -574,9 +574,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + (*l_cond.column_references, *r_cond.column_references) + for l_cond, r_cond in self.conditions + ) + ) + + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(*self.ids, *self.referenced_ids) + + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> JoinNode: transformed = dataclasses.replace( self, left_child=t(self.left_child), right_child=t(self.right_child) ) @@ -585,21 +596,14 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # If this is a cross join, make sure to select at least one column from each side - condition_cols = used_cols.union( - map(lambda x: x.id, itertools.chain.from_iterable(self.conditions)) - ) - return self.transform_children( - lambda x: x.prune(frozenset([*condition_cols, *used_cols])) - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> JoinNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> JoinNode: new_conds = tuple( ( l_cond.remap_column_refs(mappings, allow_partial_bindings=True), @@ -665,7 +669,7 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> ConcatNode: transformed = dataclasses.replace( self, children=tuple(t(child) for child in self.children) ) @@ -674,17 +678,15 @@ def transform_children( return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make concat prunable, probably by redefining - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ConcatNode: new_ids = tuple(mappings.get(id, id) for id in self.output_ids) return dataclasses.replace(self, output_ids=new_ids) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ConcatNode: return self @@ -735,25 +737,23 @@ def defines_namespace(self) -> bool: def transform_children( self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + ) -> FromRangeNode: transformed = dataclasses.replace(self, start=t(self.start), end=t(self.end)) if self == transformed: # reusing existing object speeds up eq, and saves a small amount of memory return self return transformed - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Make FromRangeNode prunable (or convert to other node types) - return self - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FromRangeNode: return dataclasses.replace( self, output_id=mappings.get(self.output_id, self.output_id) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FromRangeNode: return self @@ -774,9 +774,7 @@ def fast_offsets(self) -> bool: def fast_ordered_limit(self) -> 
bool: return False - def transform_children( - self, t: Callable[[BigFrameNode], BigFrameNode] - ) -> BigFrameNode: + def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafNode: return self @@ -785,6 +783,9 @@ class ScanItem(typing.NamedTuple): dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type source_id: str # Flexible enough for both local data and bq data + def with_id(self, id: bfet_ids.ColumnId) -> ScanItem: + return ScanItem(id, self.dtype, self.source_id) + @dataclasses.dataclass(frozen=True) class ScanList: @@ -841,25 +842,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.fields) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Don't preoduce empty scan list no matter what, will result in broken sql syntax - # TODO: Handle more elegantly - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or (self.scan_list.items[0],) - ) - return ReadLocalNode( - self.feather_bytes, - self.data_schema, - self.n_rows, - new_scan_list, - self.offsets_col, - self.session, - ) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadLocalNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -875,7 +860,9 @@ def remap_vars( self, scan_list=new_scan_list, offsets_col=new_offsets_col ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadLocalNode: return self @@ -1003,16 +990,9 @@ def row_count(self) -> typing.Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(item.id for item in self.scan_list.items) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - new_scan_list = ScanList( - tuple(item for item in self.scan_list.items if item.id in used_cols) - or (self.scan_list.items[0],) - ) - return dataclasses.replace(self, scan_list=new_scan_list) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReadTableNode: new_scan_list = ScanList( tuple( ScanItem(mappings.get(item.id, item.id), item.dtype, item.source_id) @@ -1021,7 +1001,9 @@ def remap_vars( ) return dataclasses.replace(self, scan_list=new_scan_list) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReadTableNode: return self def with_order_cols(self): @@ -1089,6 +1071,10 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + @property def added_fields(self) -> Tuple[Field, ...]: return (Field(self.col_id, bigframes.dtypes.INT_DTYPE),) @@ -1097,22 +1083,17 @@ def added_fields(self) -> Tuple[Field, ...]: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> PromoteOffsetsNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.col_id not in used_cols: - return self.child.prune(used_cols) - else: - new_used = used_cols.difference([self.col_id]) - return 
self.transform_children(lambda x: x.prune(new_used)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> PromoteOffsetsNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> PromoteOffsetsNode: return self @@ -1136,17 +1117,22 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - consumed_ids = used_cols.union(self.predicate.column_references) - pruned_child = self.child.prune(consumed_ids) - return FilterNode(pruned_child, self.predicate) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(self.predicate.column_references) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> FilterNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> FilterNode: return dataclasses.replace( self, predicate=self.predicate.remap_column_refs( @@ -1183,20 +1169,24 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - ordering_cols = itertools.chain.from_iterable( - map(lambda x: x.referenced_columns, self.by) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(self.ids) | self.referenced_ids + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) - consumed_ids = used_cols.union(ordering_cols) - pruned_child = self.child.prune(consumed_ids) - return OrderByNode(pruned_child, self.by) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> OrderByNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> OrderByNode: all_refs = set( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) ) @@ -1233,20 +1223,43 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ReversedNode: return self - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ReversedNode: return self +class AliasedRef(typing.NamedTuple): + ref: ex.DerefOp + id: bfet_ids.ColumnId + + @classmethod + def identity(cls, id: bfet_ids.ColumnId) -> AliasedRef: + return cls(ex.DerefOp(id), id) + + def remap_vars( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return AliasedRef(self.ref, mappings.get(self.id, self.id)) + + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AliasedRef: + return 
AliasedRef(ex.DerefOp(mappings.get(self.ref.id, self.ref.id)), self.id) + + @dataclasses.dataclass(frozen=True, eq=False) class SelectionNode(UnaryNode): - input_output_pairs: typing.Tuple[ - typing.Tuple[ex.DerefOp, bigframes.core.identifiers.ColumnId], ... - ] + input_output_pairs: Tuple[AliasedRef, ...] def _validate(self): for ref, _ in self.input_output_pairs: @@ -1280,33 +1293,26 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) - def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: - return {ref.id: out_id for ref, out_id in self.input_output_pairs} - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_selections = ( - tuple( - select for select in self.input_output_pairs if select[1] in used_cols - ) - or self.input_output_pairs[:1] - ) - consumed_ids = frozenset(i[0].id for i in pruned_selections) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref, id in self.input_output_pairs) - pruned_child = self.child.prune(consumed_ids) - return SelectionNode(pruned_child, pruned_selections) + def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: + return {ref.id: id for ref, id in self.input_output_pairs} def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: - new_pairs = tuple( - (ref, mappings.get(id, id)) for ref, id in self.input_output_pairs + ) -> SelectionNode: + new_fields = tuple( + item.remap_vars(mappings) for item in self.input_output_pairs ) - return dataclasses.replace(self, input_output_pairs=new_pairs) + return dataclasses.replace(self, input_output_pairs=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> SelectionNode: new_fields = tuple( - (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) - for ex, id in self.input_output_pairs + item.remap_refs(mappings) for item in self.input_output_pairs ) return dataclasses.replace(self, input_output_pairs=new_fields) # type: ignore @@ -1353,30 +1359,38 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.assignments) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + i[0].column_references for i in self.assignments + ) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset( + itertools.chain.from_iterable( + ex.column_references for ex, id in self.assignments + ) + ) + @property def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - pruned_assignments = tuple(i for i in self.assignments if i[1] in used_cols) - if len(pruned_assignments) == 0: - return self.child.prune(used_cols) - consumed_ids = itertools.chain.from_iterable( - i[0].column_references for i in pruned_assignments - ) - pruned_child = self.child.prune(used_cols.union(consumed_ids)) - return ProjectionNode(pruned_child, pruned_assignments) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ProjectionNode: new_fields = tuple((ex, mappings.get(id, 
id)) for ex, id in self.assignments) return dataclasses.replace(self, assignments=new_fields) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> ProjectionNode: new_fields = tuple( (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) for ex, id in self.assignments @@ -1418,16 +1432,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.col_id,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RowCountNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): - return self - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # TODO: Handle row count pruning + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> RowCountNode: return self @@ -1487,33 +1503,31 @@ def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return tuple(id for _, id in self.aggregations) @property - def has_ordered_ops(self) -> bool: - return not all( - aggregate.op.order_independent for aggregate, _ in self.aggregations - ) - - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: + def consumed_ids(self) -> COLUMN_SET: by_ids = (ref.id for ref in self.by_column_ids) - pruned_aggs = ( - tuple(agg for agg in self.aggregations if agg[1] in used_cols) - or self.aggregations[:1] - ) agg_inputs = itertools.chain.from_iterable( - agg.column_references for agg, _ in pruned_aggs + agg.column_references for agg, _ in self.aggregations ) - consumed_ids = frozenset(itertools.chain(by_ids, agg_inputs)) - pruned_child = self.child.prune(consumed_ids) - return AggregateNode( - pruned_child, pruned_aggs, self.by_column_ids, dropna=self.dropna + order_ids = itertools.chain.from_iterable( + part.scalar_expression.column_references for part in self.order_by + ) + return frozenset(itertools.chain(by_ids, agg_inputs, order_ids)) + + @property + def has_ordered_ops(self) -> bool: + return not all( + aggregate.op.order_independent for aggregate, _ in self.aggregations ) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> AggregateNode: new_aggs = tuple((agg, mappings.get(id, id)) for agg, id in self.aggregations) return dataclasses.replace(self, aggregations=new_aggs) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> AggregateNode: new_aggs = tuple( (agg.remap_column_refs(mappings, allow_partial_bindings=True), id) for agg, id in self.aggregations @@ -1578,6 +1592,20 @@ def added_field(self) -> Field: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.output_name,) + @property + def consumed_ids(self) -> COLUMN_SET: + return frozenset( + set(self.ids).difference([self.output_name]).union(self.referenced_ids) + ) + + @property + def referenced_ids(self) -> COLUMN_SET: + return ( + frozenset() + .union(self.expression.column_references) + .union(self.window_spec.all_referenced_columns) + ) + @property def inherits_order(self) -> bool: # does the op both use ordering at all? and if so, can it inherit order? 
@@ -1590,27 +1618,19 @@ def inherits_order(self) -> bool: def additive_base(self) -> BigFrameNode: return self.child - def replace_additive_base(self, node: BigFrameNode): + def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - if self.output_name not in used_cols: - return self.child.prune(used_cols) - consumed_ids = ( - used_cols.difference([self.output_name]) - .union(self.expression.column_references) - .union(self.window_spec.all_referenced_columns) - ) - return self.transform_children(lambda x: x.prune(consumed_ids)) - def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> WindowOpNode: return dataclasses.replace( self, output_name=mappings.get(self.output_name, self.output_name) ) - def remap_refs(self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId]): + def remap_refs( + self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + ) -> WindowOpNode: return dataclasses.replace( self, expression=self.expression.remap_column_refs( @@ -1646,14 +1666,18 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return () + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset() + def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> RandomSampleNode: return self @@ -1703,21 +1727,20 @@ def row_count(self) -> Optional[int]: def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: return (self.offsets_col,) if (self.offsets_col is not None) else () - def prune(self, used_cols: COLUMN_SET) -> BigFrameNode: - # Cannot prune explode op - consumed_ids = used_cols.union(ref.id for ref in self.column_ids) - return self.transform_children(lambda x: x.prune(consumed_ids)) + @property + def referenced_ids(self) -> COLUMN_SET: + return frozenset(ref.id for ref in self.column_ids) def remap_vars( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: if (self.offsets_col is not None) and self.offsets_col in mappings: return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] - ) -> BigFrameNode: + ) -> ExplodeNode: new_ids = tuple(id.remap_column_refs(mappings) for id in self.column_ids) return dataclasses.replace(self, column_ids=new_ids) # type: ignore diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index f93186bf36..bf93fa51b6 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -17,6 +17,7 @@ from bigframes.core.rewrite.legacy_align import legacy_join_as_projection from bigframes.core.rewrite.operators import rewrite_timedelta_ops from bigframes.core.rewrite.order import pull_up_order +from bigframes.core.rewrite.pruning import column_pruning from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice __all__ = [ @@ -27,4 +28,5 @@ "pullup_limit_from_slice", "remap_variables", "pull_up_order", + "column_pruning", ] diff --git a/bigframes/core/rewrite/implicit_align.py b/bigframes/core/rewrite/implicit_align.py index 1b864fb919..1989b1a543 100644 --- a/bigframes/core/rewrite/implicit_align.py +++ 
b/bigframes/core/rewrite/implicit_align.py @@ -113,7 +113,7 @@ def try_row_join( r_node, r_selection = pull_up_selection( r_node, stop=divergent_node, rename_vars=True ) # Rename only right vars to avoid collisions with left vars - combined_selection = (*l_selection, *r_selection) + combined_selection = l_selection + r_selection def _linearize_trees( base_tree: bigframes.core.nodes.BigFrameNode, @@ -139,10 +139,7 @@ def pull_up_selection( rename_vars: bool = False, ) -> Tuple[ bigframes.core.nodes.BigFrameNode, - Tuple[ - Tuple[bigframes.core.expression.DerefOp, bigframes.core.identifiers.ColumnId], - ..., - ], + Tuple[bigframes.core.nodes.AliasedRef, ...], ]: """Remove all selection nodes above the base node. Returns stripped tree. @@ -157,8 +154,7 @@ def pull_up_selection( """ if node == stop: # base case return node, tuple( - (bigframes.core.expression.DerefOp(field.id), field.id) - for field in node.fields + bigframes.core.nodes.AliasedRef.identity(field.id) for field in node.fields ) # InNode needs special handling, as its a binary node, but row identity is from left side only. # TODO: Merge code with unary op paths @@ -179,11 +175,15 @@ def pull_up_selection( {node.indicator_col: bigframes.core.identifiers.ColumnId.unique()} ), ) - added_selection = ( - bigframes.core.expression.DerefOp(new_in_node.indicator_col), - node.indicator_col, + added_selection = tuple( + ( + bigframes.core.nodes.AliasedRef( + bigframes.core.expression.DerefOp(new_in_node.indicator_col), + node.indicator_col, + ), + ) ) - new_selection = (*child_selections, added_selection) + new_selection = child_selections + added_selection return new_in_node, new_selection if isinstance(node, bigframes.core.nodes.AdditiveNode): @@ -204,28 +204,20 @@ def pull_up_selection( else: var_renames = {} assert isinstance(new_node, bigframes.core.nodes.AdditiveNode) - added_selections = ( - ( - bigframes.core.expression.DerefOp(var_renames.get(field.id, field.id)), - field.id, - ) + added_selections = tuple( + bigframes.core.nodes.AliasedRef.identity(field.id).remap_refs(var_renames) for field in node.added_fields ) - new_selection = (*child_selections, *added_selections) + new_selection = child_selections + added_selections return new_node, new_selection elif isinstance(node, bigframes.core.nodes.SelectionNode): child_node, child_selections = pull_up_selection( node.child, stop, rename_vars=rename_vars ) mapping = {out: ref.id for ref, out in child_selections} - new_selection = tuple( - ( - bigframes.core.expression.DerefOp(mapping[ref.id]), - out, - ) - for ref, out in node.input_output_pairs + return child_node, tuple( + ref.remap_refs(mapping) for ref in node.input_output_pairs ) - return child_node, new_selection raise ValueError(f"Couldn't pull up select from node: {node}") diff --git a/bigframes/core/rewrite/legacy_align.py b/bigframes/core/rewrite/legacy_align.py index 05641130fb..573a7026e4 100644 --- a/bigframes/core/rewrite/legacy_align.py +++ b/bigframes/core/rewrite/legacy_align.py @@ -57,7 +57,7 @@ def from_node_span( if isinstance(node, nodes.SelectionNode): return cls.from_node_span(node.child, target).select( - node.input_output_pairs + tuple(node.input_output_pairs) ) elif isinstance(node, nodes.ProjectionNode): return cls.from_node_span(node.child, target).project(node.assignments) @@ -228,7 +228,9 @@ def expand(self) -> nodes.BigFrameNode: root = nodes.FilterNode(child=root, predicate=self.predicate) if self.ordering: root = nodes.OrderByNode(child=root, by=self.ordering) - selection = 
tuple((scalar_exprs.DerefOp(id), id) for _, id in self.columns) + selection = tuple( + bigframes.core.nodes.AliasedRef.identity(id) for _, id in self.columns + ) return nodes.SelectionNode( child=nodes.ProjectionNode(child=root, assignments=self.columns), input_output_pairs=selection, diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 3f8c409b76..18e5004e1d 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -180,14 +180,10 @@ def pull_up_order_inner( col: bigframes.core.ids.ColumnId.unique() for col in unselected_order_cols } - all_selections = ( - *node.input_output_pairs, - *( - (bigframes.core.expression.DerefOp(k), v) - for k, v in new_selections.items() - ), + all_selections = node.input_output_pairs + tuple( + bigframes.core.nodes.AliasedRef(bigframes.core.expression.DerefOp(k), v) + for k, v in new_selections.items() ) - new_select_node = dataclasses.replace( node, child=child_result, input_output_pairs=all_selections ) @@ -288,7 +284,7 @@ def pull_order_concat( ) selection = tuple( ( - (bigframes.core.expression.DerefOp(id), id) + bigframes.core.nodes.AliasedRef.identity(id) for id in (*source.ids, table_id, offsets_id) ) ) @@ -396,7 +392,7 @@ def remove_order_strict( if result.ids != node.ids: return bigframes.core.nodes.SelectionNode( result, - tuple((bigframes.core.expression.DerefOp(id), id) for id in node.ids), + tuple(bigframes.core.nodes.AliasedRef.identity(id) for id in node.ids), ) return result @@ -428,7 +424,7 @@ def rename_cols( result_node = bigframes.core.nodes.SelectionNode( node, tuple( - (bigframes.core.expression.DerefOp(id), mappings.get(id, id)) + bigframes.core.nodes.AliasedRef.identity(id).remap_vars(mappings) for id in node.ids ), ) diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py new file mode 100644 index 0000000000..0b8534116d --- /dev/null +++ b/bigframes/core/rewrite/pruning.py @@ -0,0 +1,195 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import dataclasses +import functools +from typing import AbstractSet + +import bigframes.core.identifiers +import bigframes.core.nodes + + +def column_pruning( + root: bigframes.core.nodes.BigFrameNode, +) -> bigframes.core.nodes.BigFrameNode: + return bigframes.core.nodes.top_down(root, prune_columns) + + +def to_fixed(max_iterations: int = 100): + def decorator(func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + previous_result = None + current_result = func(*args, **kwargs) + attempts = 1 + + while attempts < max_iterations: + if current_result == previous_result: + return current_result + previous_result = current_result + current_result = func(current_result) + attempts += 1 + + return current_result + + return wrapper + + return decorator + + +@to_fixed(max_iterations=100) +def prune_columns(node: bigframes.core.nodes.BigFrameNode): + if isinstance(node, bigframes.core.nodes.SelectionNode): + result = prune_selection_child(node) + elif isinstance(node, bigframes.core.nodes.AggregateNode): + result = node.replace_child(prune_node(node.child, node.consumed_ids)) + elif isinstance(node, bigframes.core.nodes.InNode): + result = dataclasses.replace( + node, + right_child=prune_node(node.right_child, frozenset([node.right_col.id])), + ) + else: + result = node + return result + + +def prune_selection_child( + selection: bigframes.core.nodes.SelectionNode, +) -> bigframes.core.nodes.BigFrameNode: + child = selection.child + + # Important to check this first + if list(selection.ids) == list(child.ids): + return child + + if isinstance(child, bigframes.core.nodes.SelectionNode): + return selection.remap_refs( + {id: ref.id for ref, id in child.input_output_pairs} + ).replace_child(child.child) + elif isinstance(child, bigframes.core.nodes.AdditiveNode): + if not set(field.id for field in child.added_fields) & selection.consumed_ids: + return selection.replace_child(child.additive_base) + return selection.replace_child( + child.replace_additive_base( + prune_node( + child.additive_base, selection.consumed_ids | child.referenced_ids + ) + ) + ) + elif isinstance(child, bigframes.core.nodes.ConcatNode): + indices = [ + list(child.ids).index(ref.id) for ref, _ in selection.input_output_pairs + ] + new_children = [] + for concat_node in child.child_nodes: + cc_ids = tuple(concat_node.ids) + sub_selection = tuple( + bigframes.core.nodes.AliasedRef.identity(cc_ids[i]) for i in indices + ) + new_children.append( + bigframes.core.nodes.SelectionNode(concat_node, sub_selection) + ) + return bigframes.core.nodes.ConcatNode( + children=tuple(new_children), output_ids=tuple(selection.ids) + ) + # Nodes that pass through input columns + elif isinstance( + child, + ( + bigframes.core.nodes.RandomSampleNode, + bigframes.core.nodes.ReversedNode, + bigframes.core.nodes.OrderByNode, + bigframes.core.nodes.FilterNode, + bigframes.core.nodes.SliceNode, + bigframes.core.nodes.JoinNode, + bigframes.core.nodes.ExplodeNode, + ), + ): + ids = selection.consumed_ids | child.referenced_ids + return selection.replace_child( + child.transform_children(lambda x: prune_node(x, ids)) + ) + elif isinstance(child, bigframes.core.nodes.AggregateNode): + return selection.replace_child(prune_aggregate(child, selection.consumed_ids)) + elif isinstance(child, bigframes.core.nodes.LeafNode): + return selection.replace_child(prune_leaf(child, selection.consumed_ids)) + return selection + + +def prune_node( + node: bigframes.core.nodes.BigFrameNode, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + # This clause is 
important, ensures idempotency, so can reach fixed point + if not (set(node.ids) - ids): + return node + else: + return bigframes.core.nodes.SelectionNode( + node, + tuple( + bigframes.core.nodes.AliasedRef.identity(id) + for id in node.ids + if id in ids + ), + ) + + +def prune_aggregate( + node: bigframes.core.nodes.AggregateNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.AggregateNode: + pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) + return dataclasses.replace(node, aggregations=pruned_aggs) + + +@functools.singledispatch +def prune_leaf( + node: bigframes.core.nodes.BigFrameNode, + used_cols: AbstractSet[bigframes.core.ids.ColumnId], +): + ... + + +@prune_leaf.register +def prune_readlocal( + node: bigframes.core.nodes.ReadLocalNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadLocalNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace( + node, + scan_list=new_scan_list, + offsets_col=node.offsets_col if (node.offsets_col in selection) else None, + ) + + +@prune_leaf.register +def prune_readtable( + node: bigframes.core.nodes.ReadTableNode, + selection: AbstractSet[bigframes.core.ids.ColumnId], +) -> bigframes.core.nodes.ReadTableNode: + new_scan_list = filter_scanlist(node.scan_list, selection) + return dataclasses.replace(node, scan_list=new_scan_list) + + +def filter_scanlist( + scanlist: bigframes.core.nodes.ScanList, + ids: AbstractSet[bigframes.core.ids.ColumnId], +): + result = bigframes.core.nodes.ScanList( + tuple(item for item in scanlist.items if item.id in ids) + ) + if len(result.items) == 0: + # We need to select something, or stuff breaks + result = bigframes.core.nodes.ScanList(scanlist.items[:1]) + return result diff --git a/bigframes/core/rewrite/slices.py b/bigframes/core/rewrite/slices.py index 102ffcf773..87a7720e2f 100644 --- a/bigframes/core/rewrite/slices.py +++ b/bigframes/core/rewrite/slices.py @@ -120,7 +120,9 @@ def drop_cols( ) -> nodes.SelectionNode: # adding a whole node that redefines the schema is a lot of overhead, should do something more efficient selections = tuple( - (scalar_exprs.DerefOp(id), id) for id in node.ids if id not in drop_cols + nodes.AliasedRef(scalar_exprs.DerefOp(id), id) + for id in node.ids + if id not in drop_cols ) return nodes.SelectionNode(node, selections) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py index 652f04757b..a252f116dd 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/rewrites.py @@ -359,7 +359,7 @@ def wrap(node, _, **kwargs): return CTE(new) if node in ctes else new result = simplified.replace(wrap) - ctes = reversed([cte.parent for cte in result.find(CTE)]) + ctes = [cte.parent for cte in result.find(CTE, ordered=True)] return result, ctes diff --git a/third_party/bigframes_vendored/ibis/common/graph.py b/third_party/bigframes_vendored/ibis/common/graph.py index 1a3fc6c543..6e7995ec03 100644 --- a/third_party/bigframes_vendored/ibis/common/graph.py +++ b/third_party/bigframes_vendored/ibis/common/graph.py @@ -343,6 +343,7 @@ def find( finder: FinderLike, filter: Optional[FinderLike] = None, context: Optional[dict] = None, + ordered: bool = False, ) -> list[Node]: """Find all nodes matching a given pattern or type in the graph. @@ -360,6 +361,8 @@ def find( the given filter and stop otherwise. 
context Optional context to use if `finder` or `filter` is a pattern. + ordered + Emit nodes in topological order if `True`. Returns ------- @@ -369,6 +372,8 @@ def find( """ graph = Graph.from_bfs(self, filter=filter, context=context) finder = _coerce_finder(finder, context) + if ordered: + graph, _ = graph.toposort() return [node for node in graph.nodes() if finder(node)] @experimental From 3989fc24bd74e9d0f461f16b1dfb9b6aff7314d2 Mon Sep 17 00:00:00 2001 From: Garrett Wu <6505921+GarrettWu@users.noreply.github.com> Date: Fri, 7 Feb 2025 10:47:36 -0800 Subject: [PATCH 14/22] chore: add experimental MutimodalEmbeddingGenerator class (#1374) * chore: add experimental MutimodalEmbeddingGenerator class * fix --- bigframes/ml/llm.py | 148 +++++++++++++++++++++++++++++++++++++++++ bigframes/ml/loader.py | 2 + bigframes/ml/utils.py | 3 + 3 files changed, 153 insertions(+) diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index 7b66191a11..72c49e124b 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -57,6 +57,8 @@ _TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT, ) +_MULTIMODAL_EMBEDDING_001_ENDPOINT = "multimodalembedding@001" + _GEMINI_PRO_ENDPOINT = "gemini-pro" _GEMINI_1P5_PRO_PREVIEW_ENDPOINT = "gemini-1.5-pro-preview-0514" _GEMINI_1P5_PRO_FLASH_PREVIEW_ENDPOINT = "gemini-1.5-flash-preview-0514" @@ -762,6 +764,152 @@ def to_gbq(self, model_name: str, replace: bool = False) -> TextEmbeddingGenerat return new_model.session.read_gbq_model(model_name) +@log_adapter.class_logger +class MultimodalEmbeddingGenerator(base.RetriableRemotePredictor): + """Multimodal embedding generator LLM model. + + .. note:: + BigFrames Blob is still under experiments. It may not work and subject to change in the future. + + Args: + model_name (str, Default to "multimodalembedding@001"): + The model for multimodal embedding. Can set to "multimodalembedding@001". Multimodal-embedding models returns model embeddings for text, image and video inputs. + Default to "multimodalembedding@001". + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + Connection to connect with remote service. str of the format ... + If None, use default connection in session context. + """ + + def __init__( + self, + *, + model_name: Literal["multimodalembedding@001"] = "multimodalembedding@001", + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + if not bigframes.options.experiments.blob: + raise NotImplementedError() + self.model_name = model_name + self.session = session or global_session.get_global_session() + self.connection_name = connection_name + + self._bqml_model_factory = globals.bqml_model_factory() + self._bqml_model: core.BqmlModel = self._create_bqml_model() + + def _create_bqml_model(self): + # Parse and create connection if needed. 
+ self.connection_name = self.session._create_bq_connection( + connection=self.connection_name, iam_role="aiplatform.user" + ) + + if self.model_name != _MULTIMODAL_EMBEDDING_001_ENDPOINT: + msg = _MODEL_NOT_SUPPORTED_WARNING.format( + model_name=self.model_name, + known_models=_MULTIMODAL_EMBEDDING_001_ENDPOINT, + ) + warnings.warn(msg) + + options = { + "endpoint": self.model_name, + } + return self._bqml_model_factory.create_remote_model( + session=self.session, connection_name=self.connection_name, options=options + ) + + @classmethod + def _from_bq( + cls, session: bigframes.Session, bq_model: bigquery.Model + ) -> MultimodalEmbeddingGenerator: + assert bq_model.model_type == "MODEL_TYPE_UNSPECIFIED" + assert "remoteModelInfo" in bq_model._properties + assert "endpoint" in bq_model._properties["remoteModelInfo"] + assert "connection" in bq_model._properties["remoteModelInfo"] + + # Parse the remote model endpoint + bqml_endpoint = bq_model._properties["remoteModelInfo"]["endpoint"] + model_connection = bq_model._properties["remoteModelInfo"]["connection"] + model_endpoint = bqml_endpoint.split("/")[-1] + + model = cls( + session=session, + model_name=model_endpoint, # type: ignore + connection_name=model_connection, + ) + + model._bqml_model = core.BqmlModel(session, bq_model) + return model + + @property + def _predict_func( + self, + ) -> Callable[ + [bigframes.dataframe.DataFrame, Mapping], bigframes.dataframe.DataFrame + ]: + return self._bqml_model.generate_embedding + + @property + def _status_col(self) -> str: + return _ML_GENERATE_EMBEDDING_STATUS + + def predict( + self, X: utils.ArrayType, *, max_retries: int = 0 + ) -> bigframes.dataframe.DataFrame: + """Predict the result from input DataFrame. + + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series): + Input DataFrame or Series, can contain one or more columns. If multiple columns are in the DataFrame, it must contain a "content" column for prediction. + The content column must be of string type or BigFrames Blob of image or video. + + max_retries (int, default 0): + Max number of retries if the prediction for any rows failed. Each try needs to make progress (i.e. has successfully predicted rows) to continue the retry. + Each retry will append newly succeeded rows. When the max retries are reached, the remaining rows (the ones without successful predictions) will be appended to the end of the result. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_input_columns + n_prediction_columns). Returns predicted values. + """ + if max_retries < 0: + raise ValueError( + f"max_retries must be larger than or equal to 0, but is {max_retries}." + ) + + (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session) + + if len(X.columns) == 1: + # BQML identified the column by name + col_label = cast(blocks.Label, X.columns[0]) + X = X.rename(columns={col_label: "content"}) + + # TODO(garrettwu): remove transform to ObjRefRuntime when BQML supports ObjRef as input + if X["content"].dtype == dtypes.OBJ_REF_DTYPE: + X["content"] = X["content"].blob._get_runtime("R", with_metadata=True) + + options = { + "flatten_json_output": True, + } + + return self._predict_and_retry(X, options=options, max_retries=max_retries) + + def to_gbq( + self, model_name: str, replace: bool = False + ) -> MultimodalEmbeddingGenerator: + """Save the model to BigQuery. + + Args: + model_name (str): + The name of the model. 
+ replace (bool, default False): + Determine whether to replace if the model already exists. Default to False. + + Returns: + MultimodalEmbeddingGenerator: Saved model.""" + + new_model = self._bqml_model.copy(model_name, replace) + return new_model.session.read_gbq_model(model_name) + + @log_adapter.class_logger class GeminiTextGenerator(base.RetriableRemotePredictor): """Gemini text generator LLM model. diff --git a/bigframes/ml/loader.py b/bigframes/ml/loader.py index 5d52927ded..eef72584bc 100644 --- a/bigframes/ml/loader.py +++ b/bigframes/ml/loader.py @@ -75,6 +75,7 @@ llm._TEXT_EMBEDDING_005_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_EMBEDDING_004_ENDPOINT: llm.TextEmbeddingGenerator, llm._TEXT_MULTILINGUAL_EMBEDDING_002_ENDPOINT: llm.TextEmbeddingGenerator, + llm._MULTIMODAL_EMBEDDING_001_ENDPOINT: llm.MultimodalEmbeddingGenerator, } ) @@ -98,6 +99,7 @@ def from_bq( llm.PaLM2TextEmbeddingGenerator, llm.Claude3TextGenerator, llm.TextEmbeddingGenerator, + llm.MultimodalEmbeddingGenerator, pipeline.Pipeline, compose.ColumnTransformer, preprocessing.PreprocessingType, diff --git a/bigframes/ml/utils.py b/bigframes/ml/utils.py index e1620485d5..e034fd00f7 100644 --- a/bigframes/ml/utils.py +++ b/bigframes/ml/utils.py @@ -100,6 +100,9 @@ def parse_model_endpoint(model_endpoint: str) -> tuple[str, Optional[str]]: model_name = model_endpoint version = None + if model_endpoint.startswith("multimodalembedding"): + return model_name, version + at_idx = model_endpoint.find("@") if at_idx != -1: version = model_endpoint[at_idx + 1 :] From 48384bedf6e8fdcfc5d7edd12be8222131a05218 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Fri, 7 Feb 2025 11:44:47 -0800 Subject: [PATCH 15/22] chore: add missing bigframes package name in `pip install` in template notebook (#1376) --- notebooks/getting_started/bq_dataframes_template.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/getting_started/bq_dataframes_template.ipynb b/notebooks/getting_started/bq_dataframes_template.ipynb index 90186b297d..6b0682bb1a 100644 --- a/notebooks/getting_started/bq_dataframes_template.ipynb +++ b/notebooks/getting_started/bq_dataframes_template.ipynb @@ -118,7 +118,7 @@ "metadata": {}, "outputs": [], "source": [ - "#%pip install --upgrade" + "#%pip install --upgrade bigframes" ] }, { From b9292185a8abcd30c1d821f00f5f476291ae0243 Mon Sep 17 00:00:00 2001 From: Arwa Sharif <146148342+arwas11@users.noreply.github.com> Date: Mon, 10 Feb 2025 09:18:27 -0600 Subject: [PATCH 16/22] chore: fix GeoSeries doc (#1377) --- docs/templates/toc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index c17a1788df..d57ab1c8ac 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -209,7 +209,7 @@ name: bigframes.bigquery - items: - name: GeoSeries - uid: bigframes.geopandas + uid: bigframes.geopandas.GeoSeries name: bigframes.geopandas - items: - name: Overview From 24962cd98c5ab427c2aabc580801360b4293ebf3 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Mon, 10 Feb 2025 12:53:45 -0800 Subject: [PATCH 17/22] remove `UNIQUEIDENTIFIER` from the vendored ibis datatypes (#1379) This type has been removed in sqlglot head https://github.com/tobymao/sqlglot/commit/b12aba9be6043053f79ff50f7bdcdfdff19ddf52#diff-7857fedd1d1451b1b9a5b8efaa1cc292c02e7ee4f0d04d7e2f9d5bfb9565802c. 
--- third_party/bigframes_vendored/ibis/backends/sql/datatypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py index 2fd0e9186e..fce0643783 100644 --- a/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py +++ b/third_party/bigframes_vendored/ibis/backends/sql/datatypes.py @@ -63,7 +63,6 @@ typecode.VARBINARY: dt.Binary, typecode.VARCHAR: dt.String, typecode.VARIANT: dt.JSON, - typecode.UNIQUEIDENTIFIER: dt.UUID, typecode.SET: partial(dt.Array, dt.string), ############################# # Unsupported sqlglot types # From b598aa8ef4f6dd0cbca7629d290c5e511cdc86fc Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 10 Feb 2025 15:40:15 -0800 Subject: [PATCH 18/22] chore: support addition between a timestamp and a timedelta (#1369) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore: support addition between a timestamp and a timedelta * test_timestamp_dff * fix conftest.py * support numpy and pyarrow timedelta literals * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix format * use local fixture for testing * Remove pyarrow duration scalar support. * fix format * remove redundant imports * fix mypy * update timedelta literals during tree rewrites * update type conversions in tests to make py 3.9 happy * fix add operator for integers --------- Co-authored-by: Owl Bot --- bigframes/core/compile/scalar_op_compiler.py | 5 + bigframes/core/rewrite/operators.py | 36 +++- bigframes/core/utils.py | 21 ++- bigframes/dtypes.py | 9 +- bigframes/operations/__init__.py | 5 +- bigframes/operations/numeric_ops.py | 8 +- bigframes/operations/timedelta_ops.py | 27 ++- tests/data/scalars.jsonl | 2 +- .../small/operations/test_timedeltas.py | 166 ++++++++++++++++++ tests/unit/core/test_bf_utils.py | 18 ++ 10 files changed, 287 insertions(+), 10 deletions(-) create mode 100644 tests/system/small/operations/test_timedeltas.py diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index ea642c20fd..3e5f10eca4 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -742,6 +742,11 @@ def timestamp_diff_op_impl(x: ibis_types.TimestampValue, y: ibis_types.Timestamp return x.delta(y, "microsecond") +@scalar_op_compiler.register_binary_op(ops.timestamp_add_op) +def timestamp_add_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerValue): + return x + y.to_interval("us") + + @scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True) def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp): supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"] diff --git a/bigframes/core/rewrite/operators.py b/bigframes/core/rewrite/operators.py index 3145a9e9ae..136e9cc220 100644 --- a/bigframes/core/rewrite/operators.py +++ b/bigframes/core/rewrite/operators.py @@ -19,7 +19,7 @@ from bigframes import dtypes from bigframes import operations as ops from bigframes.core import expression as ex -from bigframes.core import nodes, schema +from bigframes.core import nodes, schema, utils @dataclasses.dataclass @@ -50,7 +50,7 @@ def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _Ty return _TypedExpr(expr, schema.get_type(expr.id.sql)) if isinstance(expr, ex.ScalarConstantExpression): - return 
_TypedExpr(expr, expr.dtype) + return _rewrite_scalar_constant_expr(expr) if isinstance(expr, ex.OpExpression): updated_inputs = tuple( @@ -61,12 +61,23 @@ def _rewrite_expressions(expr: ex.Expression, schema: schema.ArraySchema) -> _Ty raise AssertionError(f"Unexpected expression type: {type(expr)}") +def _rewrite_scalar_constant_expr(expr: ex.ScalarConstantExpression) -> _TypedExpr: + if expr.dtype is dtypes.TIMEDELTA_DTYPE: + int_repr = utils.timedelta_to_micros(expr.value) # type: ignore + return _TypedExpr(ex.const(int_repr, expr.dtype), expr.dtype) + + return _TypedExpr(expr, expr.dtype) + + def _rewrite_op_expr( expr: ex.OpExpression, inputs: typing.Tuple[_TypedExpr, ...] ) -> _TypedExpr: if isinstance(expr.op, ops.SubOp): return _rewrite_sub_op(inputs[0], inputs[1]) + if isinstance(expr.op, ops.AddOp): + return _rewrite_add_op(inputs[0], inputs[1]) + input_types = tuple(map(lambda x: x.dtype, inputs)) return _TypedExpr(expr, expr.op.output_type(*input_types)) @@ -80,3 +91,24 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: result_op.as_expr(left.expr, right.expr), result_op.output_type(left.dtype, right.dtype), ) + + +def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr: + if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE: + return _TypedExpr( + ops.timestamp_add_op.as_expr(left.expr, right.expr), + ops.timestamp_add_op.output_type(left.dtype, right.dtype), + ) + + if left.dtype is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right.dtype): + # Re-arrange operands such that timestamp is always on the left and timedelta is + # always on the right. + return _TypedExpr( + ops.timestamp_add_op.as_expr(right.expr, left.expr), + ops.timestamp_add_op.output_type(right.dtype, left.dtype), + ) + + return _TypedExpr( + ops.add_op.as_expr(left.expr, right.expr), + ops.add_op.output_type(left.dtype, right.dtype), + ) diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index 7cb2ec7535..0198f12537 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import datetime import functools import re import typing @@ -18,6 +19,7 @@ import warnings import bigframes_vendored.pandas.io.common as vendored_pandas_io_common +import numpy as np import pandas as pd import pandas.api.types as pdtypes import typing_extensions @@ -187,9 +189,22 @@ def wrapper(*args, **kwargs): return decorator -def timedelta_to_micros(td: pd.Timedelta) -> int: - # td.value returns total nanoseconds. - return td.value // 1000 +def timedelta_to_micros( + timedelta: typing.Union[pd.Timedelta, datetime.timedelta, np.timedelta64] +) -> int: + if isinstance(timedelta, pd.Timedelta): + # pd.Timedelta.value returns total nanoseconds. 
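+        # e.g. pd.Timedelta(1, "s").value == 1_000_000_000, so floor-dividing
+        # by 1000 yields the total microseconds.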
+ return timedelta.value // 1000 + + if isinstance(timedelta, np.timedelta64): + return timedelta.astype("timedelta64[us]").astype(np.int64) + + if isinstance(timedelta, datetime.timedelta): + return ( + (timedelta.days * 3600 * 24) + timedelta.seconds + ) * 1_000_000 + timedelta.microseconds + + raise TypeError(f"Unrecognized input type: {type(timedelta)}") def replace_timedeltas_with_micros(dataframe: pd.DataFrame) -> List[str]: diff --git a/bigframes/dtypes.py b/bigframes/dtypes.py index d5be2ca584..eed45e1dde 100644 --- a/bigframes/dtypes.py +++ b/bigframes/dtypes.py @@ -105,6 +105,9 @@ pd.Timestamp, datetime.date, datetime.time, + pd.Timedelta, + datetime.timedelta, + np.timedelta64, ] LOCAL_SCALAR_TYPES = typing.get_args(LOCAL_SCALAR_TYPE) @@ -420,7 +423,7 @@ def arrow_dtype_to_bigframes_dtype(arrow_dtype: pa.DataType) -> Dtype: return pd.ArrowDtype(arrow_dtype) if pa.types.is_duration(arrow_dtype): - return pd.ArrowDtype(arrow_dtype) + return TIMEDELTA_DTYPE # BigFrames doesn't distinguish between string and large_string because the # largest string (2 GB) is already larger than the largest BigQuery row. @@ -562,6 +565,10 @@ def _is_bigframes_dtype(dtype) -> bool: def _infer_dtype_from_python_type(type: type) -> Dtype: + if type in (datetime.timedelta, pd.Timedelta, np.timedelta64): + # Must check timedelta type first. Otherwise other branchs will be evaluated to true + # E.g. np.timedelta64 is a sublcass as np.integer + return TIMEDELTA_DTYPE if issubclass(type, (bool, np.bool_)): return BOOL_DTYPE if issubclass(type, (int, np.integer)): diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index d35fa2c5c2..88406317fe 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -103,6 +103,7 @@ from bigframes.operations.numeric_ops import ( abs_op, add_op, + AddOp, arccos_op, arccosh_op, arcsin_op, @@ -177,7 +178,7 @@ ) from bigframes.operations.struct_ops import StructFieldOp, StructOp from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op -from bigframes.operations.timedelta_ops import ToTimedeltaOp +from bigframes.operations.timedelta_ops import timestamp_add_op, ToTimedeltaOp __all__ = [ # Base ops @@ -249,6 +250,7 @@ "second_op", "normalize_op", # Timedelta ops + "timestamp_add_op", "ToTimedeltaOp", # Datetime ops "date_op", @@ -263,6 +265,7 @@ # Numeric ops "abs_op", "add_op", + "AddOp", "arccos_op", "arccosh_op", "arcsin_op", diff --git a/bigframes/operations/numeric_ops.py b/bigframes/operations/numeric_ops.py index 413d8d66e1..5183e5c4c5 100644 --- a/bigframes/operations/numeric_ops.py +++ b/bigframes/operations/numeric_ops.py @@ -116,12 +116,18 @@ def output_type(self, *input_types): if all(map(dtypes.is_string_like, input_types)) and len(set(input_types)) == 1: # String addition return input_types[0] + + # Timestamp addition. 
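+        # Adding a timedelta to a datetime-like value preserves the datetime-like
+        # dtype, regardless of which operand carries the timedelta.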
+ if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE: + return left_type + if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type): + return right_type + if (left_type is None or dtypes.is_numeric(left_type)) and ( right_type is None or dtypes.is_numeric(right_type) ): # Numeric addition return dtypes.coerce_to_common(left_type, right_type) - # TODO: Add temporal addition once delta types supported raise TypeError(f"Cannot add dtypes {left_type} and {right_type}") diff --git a/bigframes/operations/timedelta_ops.py b/bigframes/operations/timedelta_ops.py index f5b82c2331..69e054fa5c 100644 --- a/bigframes/operations/timedelta_ops.py +++ b/bigframes/operations/timedelta_ops.py @@ -25,7 +25,32 @@ class ToTimedeltaOp(base_ops.UnaryOp): name: typing.ClassVar[str] = "to_timedelta" unit: typing.Literal["us", "ms", "s", "m", "h", "d", "W"] - def output_type(self, *input_types): + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: if input_types[0] in (dtypes.INT_DTYPE, dtypes.FLOAT_DTYPE): return dtypes.TIMEDELTA_DTYPE raise TypeError("expected integer or float input") + + +@dataclasses.dataclass(frozen=True) +class TimestampAdd(base_ops.BinaryOp): + name: typing.ClassVar[str] = "timestamp_add" + + def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType: + # timestamp + timedelta => timestamp + if ( + dtypes.is_datetime_like(input_types[0]) + and input_types[1] is dtypes.TIMEDELTA_DTYPE + ): + return input_types[0] + # timedelta + timestamp => timestamp + if input_types[0] is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like( + input_types[1] + ): + return input_types[1] + + raise TypeError( + f"unsupported types for timestamp_add. left: {input_types[0]} right: {input_types[1]}" + ) + + +timestamp_add_op = TimestampAdd() diff --git a/tests/data/scalars.jsonl b/tests/data/scalars.jsonl index 172a55ec11..03755c94b7 100644 --- a/tests/data/scalars.jsonl +++ b/tests/data/scalars.jsonl @@ -6,4 +6,4 @@ {"bool_col": false, "bytes_col": "R8O8dGVuIFRhZw==", "date_col": "1980-03-14", "datetime_col": "1980-03-14 15:16:17", "geography_col": null, "int64_col": "55555", "int64_too": "0", "numeric_col": "5.555555", "float64_col": "555.555", "rowindex": 5, "rowindex_2": 5, "string_col": "Güten Tag!", "time_col": "15:16:17.181921", "timestamp_col": "1980-03-14T15:16:17.181921Z"} {"bool_col": true, "bytes_col": "SGVsbG8JQmlnRnJhbWVzIQc=", "date_col": "2023-05-23", "datetime_col": "2023-05-23 11:37:01", "geography_col": "MULTIPOINT (20 20, 10 40, 40 30, 30 10)", "int64_col": "101202303", "int64_too": "2", "numeric_col": "-10.090807", "float64_col": "-123.456", "rowindex": 6, "rowindex_2": 6, "string_col": "capitalize, This ", "time_col": "01:02:03.456789", "timestamp_col": "2023-05-23T11:42:55.000001Z"} {"bool_col": true, "bytes_col": null, "date_col": "2038-01-20", "datetime_col": "2038-01-19 03:14:08", "geography_col": null, "int64_col": "-214748367", "int64_too": "2", "numeric_col": "11111111.1", "float64_col": "42.42", "rowindex": 7, "rowindex_2": 7, "string_col": " سلام", "time_col": "12:00:00.000001", "timestamp_col": "2038-01-19T03:14:17.999999Z"} -{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, "geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} +{"bool_col": false, "bytes_col": null, "date_col": null, "datetime_col": null, 
"geography_col": null, "int64_col": "2", "int64_too": "1", "numeric_col": null, "float64_col": "6.87", "rowindex": 8, "rowindex_2": 8, "string_col": "T", "time_col": null, "timestamp_col": null} \ No newline at end of file diff --git a/tests/system/small/operations/test_timedeltas.py b/tests/system/small/operations/test_timedeltas.py new file mode 100644 index 0000000000..6c44a62686 --- /dev/null +++ b/tests/system/small/operations/test_timedeltas.py @@ -0,0 +1,166 @@ +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import datetime + +import numpy as np +import pandas as pd +import pandas.testing +import pytest + + +@pytest.fixture(scope="module") +def temporal_dfs(session): + pandas_df = pd.DataFrame( + { + "datetime_col": [ + pd.Timestamp("2025-02-01 01:00:01"), + pd.Timestamp("2019-01-02 02:00:00"), + ], + "timestamp_col": [ + pd.Timestamp("2023-01-01 01:00:01", tz="UTC"), + pd.Timestamp("2024-01-02 02:00:00", tz="UTC"), + ], + "timedelta_col": [pd.Timedelta(3, "s"), pd.Timedelta(-4, "d")], + } + ) + + bigframes_df = session.read_pandas(pandas_df) + + return bigframes_df, pandas_df + + +@pytest.mark.parametrize( + ("column", "pd_dtype"), + [ + ("datetime_col", " Date: Mon, 10 Feb 2025 17:10:12 -0800 Subject: [PATCH 19/22] refactor: clean up bigframes/core/__init__.py (#1380) * refactor: move ArrayValue out of init.py * fix mypy * Remove unused import * make identifier import consistent --- bigframes/core/__init__.py | 540 +--------------------------- bigframes/core/array_value.py | 553 +++++++++++++++++++++++++++++ bigframes/core/blocks.py | 7 +- bigframes/core/groupby/__init__.py | 10 +- bigframes/core/nodes.py | 154 ++++---- bigframes/core/rewrite/order.py | 42 ++- bigframes/core/rewrite/pruning.py | 14 +- bigframes/core/window/__init__.py | 5 +- 8 files changed, 668 insertions(+), 657 deletions(-) create mode 100644 bigframes/core/array_value.py diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index dc9b8e3b9b..2f3f15953c 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -11,543 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import annotations -from dataclasses import dataclass -import datetime -import functools -import io -import typing -from typing import Iterable, List, Optional, Sequence, Tuple -import warnings +from bigframes.core.array_value import ArrayValue -import google.cloud.bigquery -import pandas -import pyarrow as pa -import pyarrow.feather as pa_feather - -import bigframes.core.expression as ex -import bigframes.core.guid -import bigframes.core.identifiers as ids -import bigframes.core.join_def as join_def -import bigframes.core.local_data as local_data -import bigframes.core.nodes as nodes -from bigframes.core.ordering import OrderingExpression -import bigframes.core.ordering as orderings -import bigframes.core.schema as schemata -import bigframes.core.tree_properties -import bigframes.core.utils -from bigframes.core.window_spec import WindowSpec -import bigframes.dtypes -import bigframes.exceptions as bfe -import bigframes.operations as ops -import bigframes.operations.aggregations as agg_ops - -if typing.TYPE_CHECKING: - from bigframes.session import Session - -ORDER_ID_COLUMN = "bigframes_ordering_id" -PREDICATE_COLUMN = "bigframes_predicate" - - -@dataclass(frozen=True) -class ArrayValue: - """ - ArrayValue is an immutable type representing a 2D array with per-column types. - """ - - node: nodes.BigFrameNode - - @classmethod - def from_pyarrow(cls, arrow_table: pa.Table, session: Session): - adapted_table = local_data.adapt_pa_table(arrow_table) - schema = local_data.arrow_schema_to_bigframes(adapted_table.schema) - - iobytes = io.BytesIO() - pa_feather.write_feather(adapted_table, iobytes) - # Scan all columns by default, we define this list as it can be pruned while preserving source_def - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) - for item in schema.items - ) - ) - - node = nodes.ReadLocalNode( - iobytes.getvalue(), - data_schema=schema, - session=session, - n_rows=arrow_table.num_rows, - scan_list=scan_list, - ) - return cls(node) - - @classmethod - def from_range(cls, start, end, step): - return cls( - nodes.FromRangeNode( - start=start.node, - end=end.node, - step=step, - ) - ) - - @classmethod - def from_table( - cls, - table: google.cloud.bigquery.Table, - schema: schemata.ArraySchema, - session: Session, - *, - predicate: Optional[str] = None, - at_time: Optional[datetime.datetime] = None, - primary_key: Sequence[str] = (), - offsets_col: Optional[str] = None, - ): - if offsets_col and primary_key: - raise ValueError("must set at most one of 'offests', 'primary_key'") - if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): - msg = ( - "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" - "in preview; this behavior may change in future versions." 
- ) - warnings.warn(msg, bfe.PreviewWarning) - # define data source only for needed columns, this makes row-hashing cheaper - table_def = nodes.GbqTable.from_table(table, columns=schema.names) - - # create ordering from info - ordering = None - if offsets_col: - ordering = orderings.TotalOrdering.from_offset_col(offsets_col) - elif primary_key: - ordering = orderings.TotalOrdering.from_primary_key( - [ids.ColumnId(key_part) for key_part in primary_key] - ) - - # Scan all columns by default, we define this list as it can be pruned while preserving source_def - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) - for item in schema.items - ) - ) - source_def = nodes.BigqueryDataSource( - table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering - ) - node = nodes.ReadTableNode( - source=source_def, - scan_list=scan_list, - table_session=session, - ) - return cls(node) - - @property - def column_ids(self) -> typing.Sequence[str]: - """Returns column ids as strings.""" - return self.schema.names - - @property - def session(self) -> Session: - required_session = self.node.session - from bigframes import get_global_session - - return ( - required_session if (required_session is not None) else get_global_session() - ) - - @functools.cached_property - def schema(self) -> schemata.ArraySchema: - return self.node.schema - - @property - def explicitly_ordered(self) -> bool: - # see BigFrameNode.explicitly_ordered - return self.node.explicitly_ordered - - @property - def order_ambiguous(self) -> bool: - # see BigFrameNode.order_ambiguous - return self.node.order_ambiguous - - @property - def supports_fast_peek(self) -> bool: - return bigframes.core.tree_properties.can_fast_peek(self.node) - - def as_cached( - self: ArrayValue, - cache_table: google.cloud.bigquery.Table, - ordering: Optional[orderings.RowOrdering], - ) -> ArrayValue: - """ - Replace the node with an equivalent one that references a table where the value has been materialized to. - """ - table = nodes.GbqTable.from_table(cache_table) - source = nodes.BigqueryDataSource(table, ordering=ordering) - # Assumption: GBQ cached table uses field name as bq column name - scan_list = nodes.ScanList( - tuple( - nodes.ScanItem(field.id, field.dtype, field.id.name) - for field in self.node.fields - ) - ) - node = nodes.CachedTableNode( - original_node=self.node, - source=source, - table_session=self.session, - scan_list=scan_list, - ) - return ArrayValue(node) - - def _try_evaluate_local(self): - """Use only for unit testing paths - not fully featured. 
Will throw exception if fails.""" - import bigframes.core.compile - - return bigframes.core.compile.test_only_try_evaluate(self.node) - - def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: - return self.schema.get_type(key) - - def row_count(self) -> ArrayValue: - """Get number of rows in ArrayValue as a single-entry ArrayValue.""" - return ArrayValue(nodes.RowCountNode(child=self.node)) - - # Operations - def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: - """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" - predicate: ex.Expression = ex.deref(predicate_id) - if keep_null: - predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) - return self.filter(predicate) - - def filter(self, predicate: ex.Expression): - return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) - - def order_by( - self, by: Sequence[OrderingExpression], is_total_order: bool = False - ) -> ArrayValue: - return ArrayValue( - nodes.OrderByNode( - child=self.node, by=tuple(by), is_total_order=is_total_order - ) - ) - - def reversed(self) -> ArrayValue: - return ArrayValue(nodes.ReversedNode(child=self.node)) - - def slice( - self, start: Optional[int], stop: Optional[int], step: Optional[int] - ) -> ArrayValue: - if self.node.order_ambiguous and not (self.session._strictly_ordered): - msg = "Window ordering may be ambiguous, this can cause unstable results." - warnings.warn(msg, bfe.AmbiguousWindowWarning) - return ArrayValue( - nodes.SliceNode( - self.node, - start=start, - stop=stop, - step=step if (step is not None) else 1, - ) - ) - - def promote_offsets(self) -> Tuple[ArrayValue, str]: - """ - Convenience function to promote copy of column offsets to a value column. Can be used to reset index. - """ - col_id = self._gen_namespaced_uid() - if self.node.order_ambiguous and not (self.session._strictly_ordered): - if not self.session._allows_ambiguity: - raise ValueError( - "Generating offsets not supported in partial ordering mode" - ) - else: - msg = ( - "Window ordering may be ambiguous, this can cause unstable results." 
- ) - warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - - return ( - ArrayValue( - nodes.PromoteOffsetsNode(child=self.node, col_id=ids.ColumnId(col_id)) - ), - col_id, - ) - - def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: - """Append together multiple ArrayValue objects.""" - return ArrayValue( - nodes.ConcatNode( - children=tuple([self.node, *[val.node for val in other]]), - output_ids=tuple( - ids.ColumnId(bigframes.core.guid.generate_guid()) - for id in self.column_ids - ), - ) - ) - - def compute_values(self, assignments: Sequence[ex.Expression]): - col_ids = self._gen_namespaced_uids(len(assignments)) - ex_id_pairs = tuple( - (ex, ids.ColumnId(id)) for ex, id in zip(assignments, col_ids) - ) - return ( - ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), - col_ids, - ) - - def project_to_id(self, expression: ex.Expression): - array_val, ids = self.compute_values( - [expression], - ) - return array_val, ids[0] - - def assign(self, source_id: str, destination_id: str) -> ArrayValue: - if destination_id in self.column_ids: # Mutate case - exprs = [ - ( - bigframes.core.nodes.AliasedRef( - ex.deref(source_id if (col_id == destination_id) else col_id), - ids.ColumnId(col_id), - ) - ) - for col_id in self.column_ids - ] - else: # append case - self_projection = ( - bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) - for col_id in self.column_ids - ) - exprs = [ - *self_projection, - ( - bigframes.core.nodes.AliasedRef( - ex.deref(source_id), ids.ColumnId(destination_id) - ) - ), - ] - return ArrayValue( - nodes.SelectionNode( - child=self.node, - input_output_pairs=tuple(exprs), - ) - ) - - def create_constant( - self, - value: typing.Any, - dtype: typing.Optional[bigframes.dtypes.Dtype], - ) -> Tuple[ArrayValue, str]: - if pandas.isna(value): - # Need to assign a data type when value is NaN. - dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE - - return self.project_to_id(ex.const(value, dtype)) - - def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: - # This basically just drops and reorders columns - logically a no-op except as a final step - selections = ( - bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) - for col_id in column_ids - ) - return ArrayValue( - nodes.SelectionNode( - child=self.node, - input_output_pairs=tuple(selections), - ) - ) - - def drop_columns(self, columns: Iterable[str]) -> ArrayValue: - return self.select_columns( - [col_id for col_id in self.column_ids if col_id not in columns] - ) - - def aggregate( - self, - aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], - by_column_ids: typing.Sequence[str] = (), - dropna: bool = True, - ) -> ArrayValue: - """ - Apply aggregations to the expression. 
- Arguments: - aggregations: input_column_id, operation, output_column_id tuples - by_column_id: column id of the aggregation key, this is preserved through the transform - dropna: whether null keys should be dropped - """ - agg_defs = tuple((agg, ids.ColumnId(name)) for agg, name in aggregations) - return ArrayValue( - nodes.AggregateNode( - child=self.node, - aggregations=agg_defs, - by_column_ids=tuple(map(ex.deref, by_column_ids)), - dropna=dropna, - ) - ) - - def project_window_op( - self, - column_name: str, - op: agg_ops.UnaryWindowOp, - window_spec: WindowSpec, - *, - never_skip_nulls=False, - skip_reproject_unsafe: bool = False, - ) -> Tuple[ArrayValue, str]: - """ - Creates a new expression based on this expression with unary operation applied to one column. - column_name: the id of the input column present in the expression - op: the windowable operator to apply to the input column - window_spec: a specification of the window over which to apply the operator - output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided - never_skip_nulls: will disable null skipping for operators that would otherwise do so - skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection - """ - # TODO: Support non-deterministic windowing - if window_spec.row_bounded or not op.order_independent: - if self.node.order_ambiguous and not self.session._strictly_ordered: - if not self.session._allows_ambiguity: - raise ValueError( - "Generating offsets not supported in partial ordering mode" - ) - else: - msg = "Window ordering may be ambiguous, this can cause unstable results." - warnings.warn(msg, category=bfe.AmbiguousWindowWarning) - - output_name = self._gen_namespaced_uid() - return ( - ArrayValue( - nodes.WindowOpNode( - child=self.node, - expression=ex.UnaryAggregation(op, ex.deref(column_name)), - window_spec=window_spec, - output_name=ids.ColumnId(output_name), - never_skip_nulls=never_skip_nulls, - skip_reproject_unsafe=skip_reproject_unsafe, - ) - ), - output_name, - ) - - def isin( - self, other: ArrayValue, lcol: str, rcol: str - ) -> typing.Tuple[ArrayValue, str]: - node = nodes.InNode( - self.node, - other.node, - ex.deref(lcol), - ex.deref(rcol), - indicator_col=ids.ColumnId.unique(), - ) - return ArrayValue(node), node.indicator_col.name - - def relational_join( - self, - other: ArrayValue, - conditions: typing.Tuple[typing.Tuple[str, str], ...] = (), - type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", - ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: - l_mapping = { # Identity mapping, only rename right side - lcol.name: lcol.name for lcol in self.node.ids - } - other_node, r_mapping = self.prepare_join_names(other) - join_node = nodes.JoinNode( - left_child=self.node, - right_child=other_node, - conditions=tuple( - (ex.deref(l_mapping[l_col]), ex.deref(r_mapping[r_col])) - for l_col, r_col in conditions - ), - type=type, - ) - return ArrayValue(join_node), (l_mapping, r_mapping) - - def try_row_join( - self, - other: ArrayValue, - conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), - ) -> Optional[ - typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]] - ]: - l_mapping = { # Identity mapping, only rename right side - lcol.name: lcol.name for lcol in self.node.ids - } - other_node, r_mapping = self.prepare_join_names(other) - import bigframes.core.rewrite - - result_node = bigframes.core.rewrite.try_row_join( - self.node, other_node, conditions - ) - if result_node is None: - return None - - return ( - ArrayValue(result_node), - (l_mapping, r_mapping), - ) - - def prepare_join_names( - self, other: ArrayValue - ) -> Tuple[bigframes.core.nodes.BigFrameNode, dict[str, str]]: - if set(other.node.ids) & set(self.node.ids): - r_mapping = { # Rename conflicting names - rcol.name: rcol.name - if (rcol.name not in self.column_ids) - else bigframes.core.guid.generate_guid() - for rcol in other.node.ids - } - return ( - nodes.SelectionNode( - other.node, - tuple( - bigframes.core.nodes.AliasedRef( - ex.deref(old_id), ids.ColumnId(new_id) - ) - for old_id, new_id in r_mapping.items() - ), - ), - r_mapping, - ) - else: - return other.node, {id: id for id in other.column_ids} - - def try_legacy_row_join( - self, - other: ArrayValue, - join_type: join_def.JoinType, - join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], - mappings: typing.Tuple[join_def.JoinColumnMapping, ...], - ) -> typing.Optional[ArrayValue]: - import bigframes.core.rewrite - - result = bigframes.core.rewrite.legacy_join_as_projection( - self.node, other.node, join_keys, mappings, join_type - ) - if result is not None: - return ArrayValue(result) - return None - - def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: - assert len(column_ids) > 0 - for column_id in column_ids: - assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) - - offsets = tuple(ex.deref(id) for id in column_ids) - return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) - - def _uniform_sampling(self, fraction: float) -> ArrayValue: - """Sampling the table on given fraction. - - .. warning:: - The row numbers of result is non-deterministic, avoid to use. - """ - return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) - - # Deterministically generate namespaced ids for new variables - # These new ids are only unique within the current namespace. - # Many operations, such as joins, create new namespaces. See: BigFrameNode.defines_namespace - # When migrating to integer ids, these will generate the next available integer, in order to densely pack ids - # this will help represent variables sets as compact bitsets - def _gen_namespaced_uid(self) -> str: - return self._gen_namespaced_uids(1)[0] - - def _gen_namespaced_uids(self, n: int) -> List[str]: - return [ids.ColumnId.unique().name for _ in range(n)] +__all__ = ["ArrayValue"] diff --git a/bigframes/core/array_value.py b/bigframes/core/array_value.py new file mode 100644 index 0000000000..dc9b8e3b9b --- /dev/null +++ b/bigframes/core/array_value.py @@ -0,0 +1,553 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +from dataclasses import dataclass +import datetime +import functools +import io +import typing +from typing import Iterable, List, Optional, Sequence, Tuple +import warnings + +import google.cloud.bigquery +import pandas +import pyarrow as pa +import pyarrow.feather as pa_feather + +import bigframes.core.expression as ex +import bigframes.core.guid +import bigframes.core.identifiers as ids +import bigframes.core.join_def as join_def +import bigframes.core.local_data as local_data +import bigframes.core.nodes as nodes +from bigframes.core.ordering import OrderingExpression +import bigframes.core.ordering as orderings +import bigframes.core.schema as schemata +import bigframes.core.tree_properties +import bigframes.core.utils +from bigframes.core.window_spec import WindowSpec +import bigframes.dtypes +import bigframes.exceptions as bfe +import bigframes.operations as ops +import bigframes.operations.aggregations as agg_ops + +if typing.TYPE_CHECKING: + from bigframes.session import Session + +ORDER_ID_COLUMN = "bigframes_ordering_id" +PREDICATE_COLUMN = "bigframes_predicate" + + +@dataclass(frozen=True) +class ArrayValue: + """ + ArrayValue is an immutable type representing a 2D array with per-column types. + """ + + node: nodes.BigFrameNode + + @classmethod + def from_pyarrow(cls, arrow_table: pa.Table, session: Session): + adapted_table = local_data.adapt_pa_table(arrow_table) + schema = local_data.arrow_schema_to_bigframes(adapted_table.schema) + + iobytes = io.BytesIO() + pa_feather.write_feather(adapted_table, iobytes) + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + + node = nodes.ReadLocalNode( + iobytes.getvalue(), + data_schema=schema, + session=session, + n_rows=arrow_table.num_rows, + scan_list=scan_list, + ) + return cls(node) + + @classmethod + def from_range(cls, start, end, step): + return cls( + nodes.FromRangeNode( + start=start.node, + end=end.node, + step=step, + ) + ) + + @classmethod + def from_table( + cls, + table: google.cloud.bigquery.Table, + schema: schemata.ArraySchema, + session: Session, + *, + predicate: Optional[str] = None, + at_time: Optional[datetime.datetime] = None, + primary_key: Sequence[str] = (), + offsets_col: Optional[str] = None, + ): + if offsets_col and primary_key: + raise ValueError("must set at most one of 'offests', 'primary_key'") + if any(i.field_type == "JSON" for i in table.schema if i.name in schema.names): + msg = ( + "Interpreting JSON column(s) as the `db_dtypes.dbjson` extension type is" + "in preview; this behavior may change in future versions." 
+ ) + warnings.warn(msg, bfe.PreviewWarning) + # define data source only for needed columns, this makes row-hashing cheaper + table_def = nodes.GbqTable.from_table(table, columns=schema.names) + + # create ordering from info + ordering = None + if offsets_col: + ordering = orderings.TotalOrdering.from_offset_col(offsets_col) + elif primary_key: + ordering = orderings.TotalOrdering.from_primary_key( + [ids.ColumnId(key_part) for key_part in primary_key] + ) + + # Scan all columns by default, we define this list as it can be pruned while preserving source_def + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(ids.ColumnId(item.column), item.dtype, item.column) + for item in schema.items + ) + ) + source_def = nodes.BigqueryDataSource( + table=table_def, at_time=at_time, sql_predicate=predicate, ordering=ordering + ) + node = nodes.ReadTableNode( + source=source_def, + scan_list=scan_list, + table_session=session, + ) + return cls(node) + + @property + def column_ids(self) -> typing.Sequence[str]: + """Returns column ids as strings.""" + return self.schema.names + + @property + def session(self) -> Session: + required_session = self.node.session + from bigframes import get_global_session + + return ( + required_session if (required_session is not None) else get_global_session() + ) + + @functools.cached_property + def schema(self) -> schemata.ArraySchema: + return self.node.schema + + @property + def explicitly_ordered(self) -> bool: + # see BigFrameNode.explicitly_ordered + return self.node.explicitly_ordered + + @property + def order_ambiguous(self) -> bool: + # see BigFrameNode.order_ambiguous + return self.node.order_ambiguous + + @property + def supports_fast_peek(self) -> bool: + return bigframes.core.tree_properties.can_fast_peek(self.node) + + def as_cached( + self: ArrayValue, + cache_table: google.cloud.bigquery.Table, + ordering: Optional[orderings.RowOrdering], + ) -> ArrayValue: + """ + Replace the node with an equivalent one that references a table where the value has been materialized to. + """ + table = nodes.GbqTable.from_table(cache_table) + source = nodes.BigqueryDataSource(table, ordering=ordering) + # Assumption: GBQ cached table uses field name as bq column name + scan_list = nodes.ScanList( + tuple( + nodes.ScanItem(field.id, field.dtype, field.id.name) + for field in self.node.fields + ) + ) + node = nodes.CachedTableNode( + original_node=self.node, + source=source, + table_session=self.session, + scan_list=scan_list, + ) + return ArrayValue(node) + + def _try_evaluate_local(self): + """Use only for unit testing paths - not fully featured. 
Will throw exception if fails.""" + import bigframes.core.compile + + return bigframes.core.compile.test_only_try_evaluate(self.node) + + def get_column_type(self, key: str) -> bigframes.dtypes.Dtype: + return self.schema.get_type(key) + + def row_count(self) -> ArrayValue: + """Get number of rows in ArrayValue as a single-entry ArrayValue.""" + return ArrayValue(nodes.RowCountNode(child=self.node)) + + # Operations + def filter_by_id(self, predicate_id: str, keep_null: bool = False) -> ArrayValue: + """Filter the table on a given expression, the predicate must be a boolean series aligned with the table expression.""" + predicate: ex.Expression = ex.deref(predicate_id) + if keep_null: + predicate = ops.fillna_op.as_expr(predicate, ex.const(True)) + return self.filter(predicate) + + def filter(self, predicate: ex.Expression): + return ArrayValue(nodes.FilterNode(child=self.node, predicate=predicate)) + + def order_by( + self, by: Sequence[OrderingExpression], is_total_order: bool = False + ) -> ArrayValue: + return ArrayValue( + nodes.OrderByNode( + child=self.node, by=tuple(by), is_total_order=is_total_order + ) + ) + + def reversed(self) -> ArrayValue: + return ArrayValue(nodes.ReversedNode(child=self.node)) + + def slice( + self, start: Optional[int], stop: Optional[int], step: Optional[int] + ) -> ArrayValue: + if self.node.order_ambiguous and not (self.session._strictly_ordered): + msg = "Window ordering may be ambiguous, this can cause unstable results." + warnings.warn(msg, bfe.AmbiguousWindowWarning) + return ArrayValue( + nodes.SliceNode( + self.node, + start=start, + stop=stop, + step=step if (step is not None) else 1, + ) + ) + + def promote_offsets(self) -> Tuple[ArrayValue, str]: + """ + Convenience function to promote copy of column offsets to a value column. Can be used to reset index. + """ + col_id = self._gen_namespaced_uid() + if self.node.order_ambiguous and not (self.session._strictly_ordered): + if not self.session._allows_ambiguity: + raise ValueError( + "Generating offsets not supported in partial ordering mode" + ) + else: + msg = ( + "Window ordering may be ambiguous, this can cause unstable results." 
+ ) + warnings.warn(msg, category=bfe.AmbiguousWindowWarning) + + return ( + ArrayValue( + nodes.PromoteOffsetsNode(child=self.node, col_id=ids.ColumnId(col_id)) + ), + col_id, + ) + + def concat(self, other: typing.Sequence[ArrayValue]) -> ArrayValue: + """Append together multiple ArrayValue objects.""" + return ArrayValue( + nodes.ConcatNode( + children=tuple([self.node, *[val.node for val in other]]), + output_ids=tuple( + ids.ColumnId(bigframes.core.guid.generate_guid()) + for id in self.column_ids + ), + ) + ) + + def compute_values(self, assignments: Sequence[ex.Expression]): + col_ids = self._gen_namespaced_uids(len(assignments)) + ex_id_pairs = tuple( + (ex, ids.ColumnId(id)) for ex, id in zip(assignments, col_ids) + ) + return ( + ArrayValue(nodes.ProjectionNode(child=self.node, assignments=ex_id_pairs)), + col_ids, + ) + + def project_to_id(self, expression: ex.Expression): + array_val, ids = self.compute_values( + [expression], + ) + return array_val, ids[0] + + def assign(self, source_id: str, destination_id: str) -> ArrayValue: + if destination_id in self.column_ids: # Mutate case + exprs = [ + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id if (col_id == destination_id) else col_id), + ids.ColumnId(col_id), + ) + ) + for col_id in self.column_ids + ] + else: # append case + self_projection = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in self.column_ids + ) + exprs = [ + *self_projection, + ( + bigframes.core.nodes.AliasedRef( + ex.deref(source_id), ids.ColumnId(destination_id) + ) + ), + ] + return ArrayValue( + nodes.SelectionNode( + child=self.node, + input_output_pairs=tuple(exprs), + ) + ) + + def create_constant( + self, + value: typing.Any, + dtype: typing.Optional[bigframes.dtypes.Dtype], + ) -> Tuple[ArrayValue, str]: + if pandas.isna(value): + # Need to assign a data type when value is NaN. + dtype = dtype or bigframes.dtypes.DEFAULT_DTYPE + + return self.project_to_id(ex.const(value, dtype)) + + def select_columns(self, column_ids: typing.Sequence[str]) -> ArrayValue: + # This basically just drops and reorders columns - logically a no-op except as a final step + selections = ( + bigframes.core.nodes.AliasedRef.identity(ids.ColumnId(col_id)) + for col_id in column_ids + ) + return ArrayValue( + nodes.SelectionNode( + child=self.node, + input_output_pairs=tuple(selections), + ) + ) + + def drop_columns(self, columns: Iterable[str]) -> ArrayValue: + return self.select_columns( + [col_id for col_id in self.column_ids if col_id not in columns] + ) + + def aggregate( + self, + aggregations: typing.Sequence[typing.Tuple[ex.Aggregation, str]], + by_column_ids: typing.Sequence[str] = (), + dropna: bool = True, + ) -> ArrayValue: + """ + Apply aggregations to the expression. 
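+        Rows are grouped by `by_column_ids` (if any) before the aggregations are applied.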
+ Arguments: + aggregations: input_column_id, operation, output_column_id tuples + by_column_id: column id of the aggregation key, this is preserved through the transform + dropna: whether null keys should be dropped + """ + agg_defs = tuple((agg, ids.ColumnId(name)) for agg, name in aggregations) + return ArrayValue( + nodes.AggregateNode( + child=self.node, + aggregations=agg_defs, + by_column_ids=tuple(map(ex.deref, by_column_ids)), + dropna=dropna, + ) + ) + + def project_window_op( + self, + column_name: str, + op: agg_ops.UnaryWindowOp, + window_spec: WindowSpec, + *, + never_skip_nulls=False, + skip_reproject_unsafe: bool = False, + ) -> Tuple[ArrayValue, str]: + """ + Creates a new expression based on this expression with unary operation applied to one column. + column_name: the id of the input column present in the expression + op: the windowable operator to apply to the input column + window_spec: a specification of the window over which to apply the operator + output_name: the id to assign to the output of the operator, by default will replace input col if distinct output id not provided + never_skip_nulls: will disable null skipping for operators that would otherwise do so + skip_reproject_unsafe: skips the reprojection step, can be used when performing many non-dependent window operations, user responsible for not nesting window expressions, or using outputs as join, filter or aggregation keys before a reprojection + """ + # TODO: Support non-deterministic windowing + if window_spec.row_bounded or not op.order_independent: + if self.node.order_ambiguous and not self.session._strictly_ordered: + if not self.session._allows_ambiguity: + raise ValueError( + "Generating offsets not supported in partial ordering mode" + ) + else: + msg = "Window ordering may be ambiguous, this can cause unstable results." + warnings.warn(msg, category=bfe.AmbiguousWindowWarning) + + output_name = self._gen_namespaced_uid() + return ( + ArrayValue( + nodes.WindowOpNode( + child=self.node, + expression=ex.UnaryAggregation(op, ex.deref(column_name)), + window_spec=window_spec, + output_name=ids.ColumnId(output_name), + never_skip_nulls=never_skip_nulls, + skip_reproject_unsafe=skip_reproject_unsafe, + ) + ), + output_name, + ) + + def isin( + self, other: ArrayValue, lcol: str, rcol: str + ) -> typing.Tuple[ArrayValue, str]: + node = nodes.InNode( + self.node, + other.node, + ex.deref(lcol), + ex.deref(rcol), + indicator_col=ids.ColumnId.unique(), + ) + return ArrayValue(node), node.indicator_col.name + + def relational_join( + self, + other: ArrayValue, + conditions: typing.Tuple[typing.Tuple[str, str], ...] = (), + type: typing.Literal["inner", "outer", "left", "right", "cross"] = "inner", + ) -> typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]]: + l_mapping = { # Identity mapping, only rename right side + lcol.name: lcol.name for lcol in self.node.ids + } + other_node, r_mapping = self.prepare_join_names(other) + join_node = nodes.JoinNode( + left_child=self.node, + right_child=other_node, + conditions=tuple( + (ex.deref(l_mapping[l_col]), ex.deref(r_mapping[r_col])) + for l_col, r_col in conditions + ), + type=type, + ) + return ArrayValue(join_node), (l_mapping, r_mapping) + + def try_row_join( + self, + other: ArrayValue, + conditions: typing.Tuple[typing.Tuple[str, str], ...] 
= (), + ) -> Optional[ + typing.Tuple[ArrayValue, typing.Tuple[dict[str, str], dict[str, str]]] + ]: + l_mapping = { # Identity mapping, only rename right side + lcol.name: lcol.name for lcol in self.node.ids + } + other_node, r_mapping = self.prepare_join_names(other) + import bigframes.core.rewrite + + result_node = bigframes.core.rewrite.try_row_join( + self.node, other_node, conditions + ) + if result_node is None: + return None + + return ( + ArrayValue(result_node), + (l_mapping, r_mapping), + ) + + def prepare_join_names( + self, other: ArrayValue + ) -> Tuple[bigframes.core.nodes.BigFrameNode, dict[str, str]]: + if set(other.node.ids) & set(self.node.ids): + r_mapping = { # Rename conflicting names + rcol.name: rcol.name + if (rcol.name not in self.column_ids) + else bigframes.core.guid.generate_guid() + for rcol in other.node.ids + } + return ( + nodes.SelectionNode( + other.node, + tuple( + bigframes.core.nodes.AliasedRef( + ex.deref(old_id), ids.ColumnId(new_id) + ) + for old_id, new_id in r_mapping.items() + ), + ), + r_mapping, + ) + else: + return other.node, {id: id for id in other.column_ids} + + def try_legacy_row_join( + self, + other: ArrayValue, + join_type: join_def.JoinType, + join_keys: typing.Tuple[join_def.CoalescedColumnMapping, ...], + mappings: typing.Tuple[join_def.JoinColumnMapping, ...], + ) -> typing.Optional[ArrayValue]: + import bigframes.core.rewrite + + result = bigframes.core.rewrite.legacy_join_as_projection( + self.node, other.node, join_keys, mappings, join_type + ) + if result is not None: + return ArrayValue(result) + return None + + def explode(self, column_ids: typing.Sequence[str]) -> ArrayValue: + assert len(column_ids) > 0 + for column_id in column_ids: + assert bigframes.dtypes.is_array_like(self.get_column_type(column_id)) + + offsets = tuple(ex.deref(id) for id in column_ids) + return ArrayValue(nodes.ExplodeNode(child=self.node, column_ids=offsets)) + + def _uniform_sampling(self, fraction: float) -> ArrayValue: + """Sampling the table on given fraction. + + .. warning:: + The row numbers of result is non-deterministic, avoid to use. + """ + return ArrayValue(nodes.RandomSampleNode(self.node, fraction)) + + # Deterministically generate namespaced ids for new variables + # These new ids are only unique within the current namespace. + # Many operations, such as joins, create new namespaces. 
See: BigFrameNode.defines_namespace + # When migrating to integer ids, these will generate the next available integer, in order to densely pack ids + # this will help represent variables sets as compact bitsets + def _gen_namespaced_uid(self) -> str: + return self._gen_namespaced_uids(1)[0] + + def _gen_namespaced_uids(self, n: int) -> List[str]: + return [ids.ColumnId.unique().name for _ in range(n)] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index c6e3096e51..8d3732f3fe 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -48,6 +48,7 @@ import pandas as pd import pyarrow as pa +from bigframes import session import bigframes._config.sampling_options as sampling_options import bigframes.constants import bigframes.core as core @@ -257,7 +258,7 @@ def dtypes( return [self.expr.get_column_type(col) for col in self.value_columns] @property - def session(self) -> core.Session: + def session(self) -> session.Session: return self._expr.session @functools.cached_property @@ -2653,7 +2654,7 @@ def dtypes( ] @property - def session(self) -> core.Session: + def session(self) -> session.Session: return self._expr.session @property @@ -3171,7 +3172,7 @@ def unpivot( def _pd_index_to_array_value( - session: core.Session, + session: session.Session, index: pd.Index, ) -> core.ArrayValue: """ diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5f9fcb257e..f619cd72c9 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -22,8 +22,8 @@ import jellyfish import pandas as pd +from bigframes import session from bigframes.core import log_adapter -import bigframes.core as core import bigframes.core.block_transforms as block_ops import bigframes.core.blocks as blocks import bigframes.core.expression @@ -76,7 +76,7 @@ def __init__( ] @property - def _session(self) -> core.Session: + def _session(self) -> session.Session: return self._block.session def __getitem__( @@ -492,7 +492,7 @@ def _aggregate_all( def _apply_window_op( self, op: agg_ops.WindowOp, - window: typing.Optional[core.WindowSpec] = None, + window: typing.Optional[window_specs.WindowSpec] = None, numeric_only: bool = False, ): """Apply window op to groupby. Defaults to grouped cumulative window.""" @@ -536,7 +536,7 @@ def __init__( self._dropna = dropna # Applies to aggregations but not windowing @property - def _session(self) -> core.Session: + def _session(self) -> session.Session: return self._block.session @validations.requires_ordering() @@ -759,7 +759,7 @@ def _apply_window_op( self, op: agg_ops.WindowOp, discard_name=False, - window: typing.Optional[core.WindowSpec] = None, + window: typing.Optional[window_specs.WindowSpec] = None, never_skip_nulls: bool = False, ): """Apply window op to groupby. 
Defaults to grouped cumulative window.""" diff --git a/bigframes/core/nodes.py b/bigframes/core/nodes.py index 88e084d79c..d5273e5c0a 100644 --- a/bigframes/core/nodes.py +++ b/bigframes/core/nodes.py @@ -24,10 +24,9 @@ import google.cloud.bigquery as bq +from bigframes.core import identifiers import bigframes.core.expression as ex import bigframes.core.guid -import bigframes.core.identifiers -import bigframes.core.identifiers as bfet_ids from bigframes.core.ordering import OrderingExpression import bigframes.core.schema as schemata import bigframes.core.slices as slices @@ -42,14 +41,14 @@ # A fixed number of variable to assume for overhead on some operations OVERHEAD_VARIABLES = 5 -COLUMN_SET = frozenset[bfet_ids.ColumnId] +COLUMN_SET = frozenset[identifiers.ColumnId] Self = TypeVar("Self") @dataclasses.dataclass(frozen=True) class Field: - id: bfet_ids.ColumnId + id: identifiers.ColumnId dtype: bigframes.dtypes.Dtype @@ -91,21 +90,21 @@ def row_count(self) -> typing.Optional[int]: @abc.abstractmethod def remap_vars( - self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> Self: """Remap defined (in this node only) variables.""" ... @abc.abstractmethod def remap_refs( - self: Self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self: Self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> Self: """Remap variable references""" ... @property @abc.abstractmethod - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: """The variables defined in this node (as opposed to by child nodes).""" ... @@ -179,7 +178,7 @@ def fields(self) -> Iterable[Field]: ... @property - def ids(self) -> Iterable[bfet_ids.ColumnId]: + def ids(self) -> Iterable[identifiers.ColumnId]: """All output ids from the node.""" return (field.id for field in self.fields) @@ -275,7 +274,7 @@ def defines_namespace(self) -> bool: """ return False - def get_type(self, id: bfet_ids.ColumnId) -> bigframes.dtypes.Dtype: + def get_type(self, id: identifiers.ColumnId) -> bigframes.dtypes.Dtype: return self._dtype_lookup[id] @functools.cached_property @@ -396,7 +395,7 @@ def row_count(self) -> typing.Optional[int]: ) @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -404,12 +403,12 @@ def referenced_ids(self) -> COLUMN_SET: return frozenset() def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> SliceNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> SliceNode: return self @@ -426,7 +425,7 @@ class InNode(BigFrameNode, AdditiveNode): right_child: BigFrameNode left_col: ex.DerefOp right_col: ex.DerefOp - indicator_col: bfet_ids.ColumnId + indicator_col: identifiers.ColumnId def _validate(self): assert not ( @@ -479,7 +478,7 @@ def row_count(self) -> Optional[int]: return self.left_child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.indicator_col,) @property @@ -503,14 +502,14 @@ def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> InNod return transformed def remap_vars( - self, 
mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> InNode: return dataclasses.replace( self, indicator_col=mappings.get(self.indicator_col, self.indicator_col) ) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> InNode: return dataclasses.replace(self, left_col=self.left_col.remap_column_refs(mappings, allow_partial_bindings=True), right_col=self.right_col.remap_column_refs(mappings, allow_partial_bindings=True)) # type: ignore @@ -571,7 +570,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -597,12 +596,12 @@ def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> JoinN return transformed def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> JoinNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> JoinNode: new_conds = tuple( ( @@ -618,7 +617,7 @@ def remap_refs( class ConcatNode(BigFrameNode): # TODO: Explcitly map column ids from each child children: Tuple[BigFrameNode, ...] - output_ids: Tuple[bfet_ids.ColumnId, ...] + output_ids: Tuple[identifiers.ColumnId, ...] def _validate(self): if len(self.children) == 0: @@ -664,7 +663,7 @@ def row_count(self) -> Optional[int]: return total @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return self.output_ids def transform_children( @@ -679,13 +678,13 @@ def transform_children( return transformed def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ConcatNode: new_ids = tuple(mappings.get(id, id) for id in self.output_ids) return dataclasses.replace(self, output_ids=new_ids) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ConcatNode: return self @@ -696,7 +695,7 @@ class FromRangeNode(BigFrameNode): start: BigFrameNode end: BigFrameNode step: int - output_id: bfet_ids.ColumnId = bfet_ids.ColumnId("labels") + output_id: identifiers.ColumnId = identifiers.ColumnId("labels") @property def roots(self) -> typing.Set[BigFrameNode]: @@ -728,7 +727,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.output_id,) @property @@ -745,14 +744,14 @@ def transform_children( return transformed def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FromRangeNode: return dataclasses.replace( self, output_id=mappings.get(self.output_id, self.output_id) ) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FromRangeNode: return self @@ -779,11 +778,11 @@ def transform_children(self, t: Callable[[BigFrameNode], BigFrameNode]) -> LeafN class 
ScanItem(typing.NamedTuple): - id: bfet_ids.ColumnId + id: identifiers.ColumnId dtype: bigframes.dtypes.Dtype # Might be multiple logical types for a given physical source type source_id: str # Flexible enough for both local data and bq data - def with_id(self, id: bfet_ids.ColumnId) -> ScanItem: + def with_id(self, id: identifiers.ColumnId) -> ScanItem: return ScanItem(id, self.dtype, self.source_id) @@ -839,11 +838,11 @@ def row_count(self) -> typing.Optional[int]: return self.n_rows @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(item.id for item in self.fields) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReadLocalNode: new_scan_list = ScanList( tuple( @@ -861,7 +860,7 @@ def remap_vars( ) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReadLocalNode: return self @@ -987,11 +986,11 @@ def row_count(self) -> typing.Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(item.id for item in self.scan_list.items) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReadTableNode: new_scan_list = ScanList( tuple( @@ -1002,7 +1001,7 @@ def remap_vars( return dataclasses.replace(self, scan_list=new_scan_list) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReadTableNode: return self @@ -1015,7 +1014,7 @@ def with_order_cols(self): scan_cols = {col.source_id for col in self.scan_list.items} new_scan_cols = [ ScanItem( - bigframes.core.ids.ColumnId.unique(), + identifiers.ColumnId.unique(), dtype=bigframes.dtypes.convert_schema_field(field)[1], source_id=field.name, ) @@ -1024,10 +1023,7 @@ def with_order_cols(self): ] new_scan_list = ScanList(items=(*self.scan_list.items, *new_scan_cols)) new_order = self.source.ordering.remap_column_refs( - { - bigframes.core.ids.ColumnId(item.source_id): item.id - for item in new_scan_cols - }, + {identifiers.ColumnId(item.source_id): item.id for item in new_scan_cols}, allow_partial_bindings=True, ) return dataclasses.replace(self, scan_list=new_scan_list), new_order @@ -1068,7 +1064,7 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.col_id,) @property @@ -1087,12 +1083,12 @@ def replace_additive_base(self, node: BigFrameNode) -> PromoteOffsetsNode: return dataclasses.replace(self, child=node) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> PromoteOffsetsNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> PromoteOffsetsNode: return self @@ -1114,7 +1110,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> 
Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -1126,12 +1122,12 @@ def referenced_ids(self) -> COLUMN_SET: return frozenset(self.predicate.column_references) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FilterNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> FilterNode: return dataclasses.replace( self, @@ -1166,7 +1162,7 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -1180,12 +1176,12 @@ def referenced_ids(self) -> COLUMN_SET: ) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> OrderByNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> OrderByNode: all_refs = set( itertools.chain.from_iterable(map(lambda x: x.referenced_columns, self.by)) @@ -1220,7 +1216,7 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -1228,31 +1224,31 @@ def referenced_ids(self) -> COLUMN_SET: return frozenset() def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReversedNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ReversedNode: return self class AliasedRef(typing.NamedTuple): ref: ex.DerefOp - id: bfet_ids.ColumnId + id: identifiers.ColumnId @classmethod - def identity(cls, id: bfet_ids.ColumnId) -> AliasedRef: + def identity(cls, id: identifiers.ColumnId) -> AliasedRef: return cls(ex.DerefOp(id), id) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> AliasedRef: return AliasedRef(self.ref, mappings.get(self.id, self.id)) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> AliasedRef: return AliasedRef(ex.DerefOp(mappings.get(self.ref.id, self.ref.id)), self.id) @@ -1290,18 +1286,18 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.input_output_pairs) @property def consumed_ids(self) -> COLUMN_SET: return frozenset(ref.id for ref, id in self.input_output_pairs) - def get_id_mapping(self) -> dict[bfet_ids.ColumnId, bfet_ids.ColumnId]: + def get_id_mapping(self) -> dict[identifiers.ColumnId, identifiers.ColumnId]: return {ref.id: id for ref, id in self.input_output_pairs} def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] 
) -> SelectionNode: new_fields = tuple( item.remap_vars(mappings) for item in self.input_output_pairs @@ -1309,7 +1305,7 @@ def remap_vars( return dataclasses.replace(self, input_output_pairs=new_fields) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> SelectionNode: new_fields = tuple( item.remap_refs(mappings) for item in self.input_output_pairs @@ -1356,7 +1352,7 @@ def row_count(self) -> Optional[int]: return self.child.row_count @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.assignments) @property @@ -1383,13 +1379,13 @@ def replace_additive_base(self, node: BigFrameNode) -> ProjectionNode: return dataclasses.replace(self, child=node) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ProjectionNode: new_fields = tuple((ex, mappings.get(id, id)) for ex, id in self.assignments) return dataclasses.replace(self, assignments=new_fields) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ProjectionNode: new_fields = tuple( (ex.remap_column_refs(mappings, allow_partial_bindings=True), id) @@ -1402,7 +1398,7 @@ def remap_refs( # Row count can be compute from table metadata sometimes, so it is a bit special. @dataclasses.dataclass(frozen=True, eq=False) class RowCountNode(UnaryNode): - col_id: bfet_ids.ColumnId = bfet_ids.ColumnId("count") + col_id: identifiers.ColumnId = identifiers.ColumnId("count") @property def row_preserving(self) -> bool: @@ -1429,7 +1425,7 @@ def row_count(self) -> Optional[int]: return 1 @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.col_id,) @property @@ -1437,12 +1433,12 @@ def consumed_ids(self) -> COLUMN_SET: return frozenset() def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> RowCountNode: return dataclasses.replace(self, col_id=mappings.get(self.col_id, self.col_id)) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> RowCountNode: return self @@ -1499,7 +1495,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return tuple(id for _, id in self.aggregations) @property @@ -1520,13 +1516,13 @@ def has_ordered_ops(self) -> bool: ) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> AggregateNode: new_aggs = tuple((agg, mappings.get(id, id)) for agg, id in self.aggregations) return dataclasses.replace(self, aggregations=new_aggs) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> AggregateNode: new_aggs = tuple( (agg.remap_column_refs(mappings, allow_partial_bindings=True), id) @@ -1589,7 +1585,7 @@ def added_field(self) -> Field: ) @property - def node_defined_ids(self) -> 
Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.output_name,) @property @@ -1622,14 +1618,14 @@ def replace_additive_base(self, node: BigFrameNode) -> WindowOpNode: return dataclasses.replace(self, child=node) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> WindowOpNode: return dataclasses.replace( self, output_name=mappings.get(self.output_name, self.output_name) ) def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> WindowOpNode: return dataclasses.replace( self, @@ -1663,7 +1659,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return () @property @@ -1671,12 +1667,12 @@ def referenced_ids(self) -> COLUMN_SET: return frozenset() def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> RandomSampleNode: return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> RandomSampleNode: return self @@ -1724,7 +1720,7 @@ def row_count(self) -> Optional[int]: return None @property - def node_defined_ids(self) -> Tuple[bfet_ids.ColumnId, ...]: + def node_defined_ids(self) -> Tuple[identifiers.ColumnId, ...]: return (self.offsets_col,) if (self.offsets_col is not None) else () @property @@ -1732,14 +1728,14 @@ def referenced_ids(self) -> COLUMN_SET: return frozenset(ref.id for ref in self.column_ids) def remap_vars( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ExplodeNode: if (self.offsets_col is not None) and self.offsets_col in mappings: return dataclasses.replace(self, offsets_col=mappings[self.offsets_col]) return self def remap_refs( - self, mappings: Mapping[bfet_ids.ColumnId, bfet_ids.ColumnId] + self, mappings: Mapping[identifiers.ColumnId, identifiers.ColumnId] ) -> ExplodeNode: new_ids = tuple(id.remap_column_refs(mappings) for id in self.column_ids) return dataclasses.replace(self, column_ids=new_ids) # type: ignore diff --git a/bigframes/core/rewrite/order.py b/bigframes/core/rewrite/order.py index 18e5004e1d..bdb30fbc34 100644 --- a/bigframes/core/rewrite/order.py +++ b/bigframes/core/rewrite/order.py @@ -15,12 +15,13 @@ import functools from typing import Mapping, Tuple +from bigframes.core import identifiers import bigframes.core.expression -import bigframes.core.identifiers import bigframes.core.nodes import bigframes.core.ordering import bigframes.core.window_spec import bigframes.operations +from bigframes.operations import aggregations as agg_ops # Makes ordering explicit in window definitions @@ -54,12 +55,12 @@ def pull_up_order_inner( new_node, child_order = pull_up_order_inner(node.child) new_by = [] - ids: list[bigframes.core.ids.ColumnId] = [] + ids: list[identifiers.ColumnId] = [] for part in node.by: if not isinstance( part.scalar_expression, bigframes.core.expression.DerefOp ): - id = bigframes.core.ids.ColumnId.unique() + id = identifiers.ColumnId.unique() new_node = bigframes.core.nodes.ProjectionNode( new_node, ((part.scalar_expression, id),) ) @@ -114,7 +115,7 @@ def 
pull_up_order_inner( ) elif isinstance(node, bigframes.core.nodes.ReadLocalNode): if node.offsets_col is None: - offsets_id = bigframes.core.ids.ColumnId.unique() + offsets_id = identifiers.ColumnId.unique() new_root = dataclasses.replace(node, offsets_col=offsets_id) return new_root, bigframes.core.ordering.TotalOrdering.from_offset_col( offsets_id @@ -145,7 +146,7 @@ def pull_up_order_inner( else: # Otherwise we need to generate offsets agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() + agg_ops.RowNumberOp() ) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(child_order.all_ordering_columns) @@ -177,8 +178,7 @@ def pull_up_order_inner( ) # Create unique ids just to be safe new_selections = { - col: bigframes.core.ids.ColumnId.unique() - for col in unselected_order_cols + col: identifiers.ColumnId.unique() for col in unselected_order_cols } all_selections = node.input_output_pairs + tuple( bigframes.core.nodes.AliasedRef(bigframes.core.expression.DerefOp(k), v) @@ -240,14 +240,14 @@ def pull_up_order_inner( elif isinstance(node, bigframes.core.nodes.ExplodeNode): child_result, child_order = pull_up_order_inner(node.child) if node.offsets_col is None: - offsets_id = bigframes.core.ids.ColumnId.unique() + offsets_id = identifiers.ColumnId.unique() new_explode: bigframes.core.nodes.BigFrameNode = dataclasses.replace( node, child=child_result, offsets_col=offsets_id ) else: offsets_id = node.offsets_col new_explode = node.replace_child(child_result) - inner_order = bigframes.core.orderings.TotalOrdering.from_offset_col( + inner_order = bigframes.core.ordering.TotalOrdering.from_offset_col( offsets_id ) return new_explode, child_order.join(inner_order) @@ -261,8 +261,8 @@ def pull_order_concat( new_sources = [] for i, source in enumerate(node.child_nodes): new_source, order = pull_up_order_inner(source) - offsets_id = bigframes.core.ids.ColumnId.unique() - table_id = bigframes.core.ids.ColumnId.unique() + offsets_id = identifiers.ColumnId.unique() + table_id = identifiers.ColumnId.unique() if order.is_total_ordering and order.integer_encoding.is_encoded: order_expression = order.total_order_col assert order_expression is not None @@ -271,7 +271,7 @@ def pull_order_concat( ) else: agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() + agg_ops.RowNumberOp() ) window_spec = bigframes.core.window_spec.unbound( ordering=tuple(order.all_ordering_columns) @@ -291,8 +291,8 @@ def pull_order_concat( new_source = bigframes.core.nodes.SelectionNode(new_source, selection) new_sources.append(new_source) - union_offsets_id = bigframes.core.ids.ColumnId.unique() - union_table_id = bigframes.core.ids.ColumnId.unique() + union_offsets_id = identifiers.ColumnId.unique() + union_table_id = identifiers.ColumnId.unique() new_ids = (*node.output_ids, union_table_id, union_offsets_id) new_node = dataclasses.replace( node, children=tuple(new_sources), output_ids=new_ids @@ -317,7 +317,7 @@ def pull_order_join( if node.type in ("right", "outer"): # right side is nullable - left_indicator = bigframes.core.ids.ColumnId.unique() + left_indicator = identifiers.ColumnId.unique() left_child = bigframes.core.nodes.ProjectionNode( left_child, ((bigframes.core.expression.const(True), left_indicator),) ) @@ -326,7 +326,7 @@ def pull_order_join( ) if node.type in ("left", "outer"): # right side is nullable - right_indicator = bigframes.core.ids.ColumnId.unique() + right_indicator = identifiers.ColumnId.unique() right_child = 
bigframes.core.nodes.ProjectionNode( right_child, ((bigframes.core.expression.const(True), right_indicator),) ) @@ -406,20 +406,18 @@ def remove_order_strict( def rewrite_promote_offsets( node: bigframes.core.nodes.PromoteOffsetsNode, ) -> bigframes.core.nodes.WindowOpNode: - agg = bigframes.core.expression.NullaryAggregation( - bigframes.core.agg_ops.RowNumberOp() - ) + agg = bigframes.core.expression.NullaryAggregation(agg_ops.RowNumberOp()) window_spec = bigframes.core.window_spec.unbound() return bigframes.core.nodes.WindowOpNode(node.child, agg, window_spec, node.col_id) def rename_cols( - node: bigframes.core.nodes.BigFrameNode, cols: set[bigframes.core.ids.ColumnId] + node: bigframes.core.nodes.BigFrameNode, cols: set[identifiers.ColumnId] ) -> Tuple[ bigframes.core.nodes.BigFrameNode, - Mapping[bigframes.core.ids.ColumnId, bigframes.core.ids.ColumnId], + Mapping[identifiers.ColumnId, identifiers.ColumnId], ]: - mappings = dict((id, bigframes.core.ids.ColumnId.unique()) for id in cols) + mappings = dict((id, identifiers.ColumnId.unique()) for id in cols) result_node = bigframes.core.nodes.SelectionNode( node, diff --git a/bigframes/core/rewrite/pruning.py b/bigframes/core/rewrite/pruning.py index 0b8534116d..7e40137f3e 100644 --- a/bigframes/core/rewrite/pruning.py +++ b/bigframes/core/rewrite/pruning.py @@ -15,7 +15,7 @@ import functools from typing import AbstractSet -import bigframes.core.identifiers +from bigframes.core import identifiers import bigframes.core.nodes @@ -128,7 +128,7 @@ def prune_selection_child( def prune_node( node: bigframes.core.nodes.BigFrameNode, - ids: AbstractSet[bigframes.core.ids.ColumnId], + ids: AbstractSet[identifiers.ColumnId], ): # This clause is important, ensures idempotency, so can reach fixed point if not (set(node.ids) - ids): @@ -146,7 +146,7 @@ def prune_node( def prune_aggregate( node: bigframes.core.nodes.AggregateNode, - used_cols: AbstractSet[bigframes.core.ids.ColumnId], + used_cols: AbstractSet[identifiers.ColumnId], ) -> bigframes.core.nodes.AggregateNode: pruned_aggs = tuple(agg for agg in node.aggregations if agg[1] in used_cols) return dataclasses.replace(node, aggregations=pruned_aggs) @@ -155,7 +155,7 @@ def prune_aggregate( @functools.singledispatch def prune_leaf( node: bigframes.core.nodes.BigFrameNode, - used_cols: AbstractSet[bigframes.core.ids.ColumnId], + used_cols: AbstractSet[identifiers.ColumnId], ): ... 
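The prune_leaf helper in the hunk above is a functools.singledispatch function: the generic definition is a no-op stub, and per-node-type pruning rules are attached with prune_leaf.register (see the ReadLocalNode and ReadTableNode registrations in the next hunk). A minimal standalone sketch of that dispatch pattern follows; the Leaf and TableLeaf classes and the prune function are hypothetical stand-ins for illustration, not the real bigframes nodes.

import dataclasses
import functools
from typing import AbstractSet, Tuple


@dataclasses.dataclass(frozen=True)
class Leaf:
    columns: Tuple[str, ...]


@dataclasses.dataclass(frozen=True)
class TableLeaf(Leaf):
    table: str


@functools.singledispatch
def prune(node: Leaf, used: AbstractSet[str]) -> Leaf:
    # Generic fallback, analogous to the "..." stub above: leave unknown
    # node types untouched.
    return node


@prune.register
def _(node: TableLeaf, used: AbstractSet[str]) -> Leaf:
    # Registered rule for table scans: drop columns nothing refers to,
    # analogous to prune_readtable filtering its scan list.
    return dataclasses.replace(
        node, columns=tuple(c for c in node.columns if c in used)
    )


print(prune(TableLeaf(("a", "b", "c"), "t"), {"a"}))
# TableLeaf(columns=('a',), table='t')
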
@@ -163,7 +163,7 @@ def prune_leaf( @prune_leaf.register def prune_readlocal( node: bigframes.core.nodes.ReadLocalNode, - selection: AbstractSet[bigframes.core.ids.ColumnId], + selection: AbstractSet[identifiers.ColumnId], ) -> bigframes.core.nodes.ReadLocalNode: new_scan_list = filter_scanlist(node.scan_list, selection) return dataclasses.replace( @@ -176,7 +176,7 @@ def prune_readlocal( @prune_leaf.register def prune_readtable( node: bigframes.core.nodes.ReadTableNode, - selection: AbstractSet[bigframes.core.ids.ColumnId], + selection: AbstractSet[identifiers.ColumnId], ) -> bigframes.core.nodes.ReadTableNode: new_scan_list = filter_scanlist(node.scan_list, selection) return dataclasses.replace(node, scan_list=new_scan_list) @@ -184,7 +184,7 @@ def prune_readtable( def filter_scanlist( scanlist: bigframes.core.nodes.ScanList, - ids: AbstractSet[bigframes.core.ids.ColumnId], + ids: AbstractSet[identifiers.ColumnId], ): result = bigframes.core.nodes.ScanList( tuple(item for item in scanlist.items if item.id in ids) diff --git a/bigframes/core/window/__init__.py b/bigframes/core/window/__init__.py index 2b45560b15..7758145fd4 100644 --- a/bigframes/core/window/__init__.py +++ b/bigframes/core/window/__init__.py @@ -18,8 +18,7 @@ import bigframes_vendored.pandas.core.window.rolling as vendored_pandas_rolling -from bigframes.core import log_adapter -import bigframes.core as core +from bigframes.core import log_adapter, window_spec import bigframes.core.blocks as blocks import bigframes.operations.aggregations as agg_ops @@ -31,7 +30,7 @@ class Window(vendored_pandas_rolling.Window): def __init__( self, block: blocks.Block, - window_spec: core.WindowSpec, + window_spec: window_spec.WindowSpec, value_column_ids: typing.Sequence[str], drop_null_groups: bool = True, is_series: bool = False, From ffe7dc6bba8cfd95be2489eac603f1ac5554a7ef Mon Sep 17 00:00:00 2001 From: Shuowei Li Date: Mon, 10 Feb 2025 17:16:28 -0800 Subject: [PATCH 20/22] chore: add experimental blob.pdf_chunking function (#1370) * pdf chunking code is done * pdf chunking is working, currently, we save all chunked results back to GCS. 
We will change it for the next round * pdf chunking, it takes a GCS link as input, and write chunked output into a bigquery table * refactor code * move the import blob place to fix bugs --------- Co-authored-by: Shuowei Li --- bigframes/blob/_functions.py | 74 +++++++++++++++++ bigframes/operations/blob.py | 152 ++++++++++++++++++++++++++++++++--- 2 files changed, 215 insertions(+), 11 deletions(-) diff --git a/bigframes/blob/_functions.py b/bigframes/blob/_functions.py index 397a37ee92..a05030140e 100644 --- a/bigframes/blob/_functions.py +++ b/bigframes/blob/_functions.py @@ -128,3 +128,77 @@ def image_blur_func( image_blur_def = FunctionDef(image_blur_func, ["opencv-python", "numpy", "requests"]) + + +# Extracts all text from a PDF url +def pdf_extract_func(src_obj_ref_rt: str) -> str: + import io + import json + + from pypdf import PdfReader # type: ignore + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url, stream=True) + response.raise_for_status() + pdf_bytes = response.content + + pdf_file = io.BytesIO(pdf_bytes) + reader = PdfReader(pdf_file, strict=False) + + all_text = "" + for page in reader.pages: + page_extract_text = page.extract_text() + if page_extract_text: + all_text += page_extract_text + return all_text + + +pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests"]) + + +# Extracts text from a PDF url and chunks it simultaneously +def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> str: + import io + import json + + from pypdf import PdfReader # type: ignore + import requests + + src_obj_ref_rt_json = json.loads(src_obj_ref_rt) + src_url = src_obj_ref_rt_json["access_urls"]["read_url"] + + response = requests.get(src_url, stream=True) + response.raise_for_status() + pdf_bytes = response.content + + pdf_file = io.BytesIO(pdf_bytes) + reader = PdfReader(pdf_file, strict=False) + + # extract and chunk text simultaneously + all_text_chunks = [] + curr_chunk = "" + for page in reader.pages: + page_text = page.extract_text() + if page_text: + curr_chunk += page_text + # split the accumulated text into chunks of a specific size with overlaop + # this loop implements a sliding window approach to create chunks + while len(curr_chunk) >= chunk_size: + split_idx = curr_chunk.rfind(" ", 0, chunk_size) + if split_idx == -1: + split_idx = chunk_size + actual_chunk = curr_chunk[:split_idx] + all_text_chunks.append(actual_chunk) + overlap = curr_chunk[split_idx + 1 : split_idx + 1 + overlap_size] + curr_chunk = overlap + curr_chunk[split_idx + 1 + overlap_size :] + if curr_chunk: + all_text_chunks.append(curr_chunk) + + all_text_json_string = json.dumps(all_text_chunks) + return all_text_json_string + + +pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests"]) diff --git a/bigframes/operations/blob.py b/bigframes/operations/blob.py index 205a9fcf5c..7fa4dd9633 100644 --- a/bigframes/operations/blob.py +++ b/bigframes/operations/blob.py @@ -224,6 +224,54 @@ def display_single_url(https://github.com/read_url:%20str,%20content_type:%20str): for _, row in df.iterrows(): display_single_url(https://github.com/row[%22read_url%22],%20row[%22content_type%22]) + def _resolve_connection(self, connection: Optional[str] = None) -> str: + """Resovle the BigQuery connection. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. 
+ + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" is + str. If None, uses default connection of the session. + + Returns: + str: the resolved BigQuery connection string in the format: + "project.location.connection_id". + + Raises: + ValueError: If the connection cannot be resolved to a valid string. + """ + connection = connection or self._block.session._bq_connection + return clients.resolve_full_bq_connection_name( + connection, + default_project=self._block.session._project, + default_location=self._block.session._location, + ) + + def _get_runtime_json_str( + self, mode: str = "R", with_metadata: bool = False + ) -> bigframes.series.Series: + """Get the runtime and apply the ToJSONSTring transformation. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + mode(str or str, default "R"): the mode for accessing the runtime. + Default to "R". Possible values are "R" (read-only) and + "RW" (read-write) + with_metadata (bool, default False): whether to include metadata + in the JOSN string. Default to False. + + Returns: + str: the runtime object in the JSON string. + """ + runtime = self._get_runtime(mode=mode, with_metadata=with_metadata) + return runtime._apply_unary_op(ops.ToJSONString()) + def image_blur( self, ksize: tuple[int, int], @@ -246,12 +294,7 @@ def image_blur( """ import bigframes.blob._functions as blob_func - connection = connection or self._block.session._bq_connection - connection = clients.resolve_full_bq_connection_name( - connection, - default_project=self._block.session._project, - default_location=self._block.session._location, - ) + connection = self._resolve_connection(connection) if isinstance(dst, str): dst = os.path.join(dst, "") @@ -268,11 +311,8 @@ def image_blur( connection=connection, ).udf() - src_rt = self._get_runtime(mode="R") - dst_rt = dst.blob._get_runtime(mode="RW") - - src_rt = src_rt._apply_unary_op(ops.ToJSONString()) - dst_rt = dst_rt._apply_unary_op(ops.ToJSONString()) + src_rt = self._get_runtime_json_str(mode="R") + dst_rt = dst.blob._get_runtime_json_str(mode="RW") df = src_rt.to_frame().join(dst_rt.to_frame(), how="outer") df["ksize_x"], df["ksize_y"] = ksize @@ -281,3 +321,93 @@ def image_blur( res.cache() # to execute the udf return dst + + def pdf_extract( + self, *, connection: Optional[str] = None + ) -> bigframes.series.Series: + """Extracts and chunks text from PDF URLs and saves the text as + arrays of string. + + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + + Returns: + bigframes.series.Series: conatins all text from a pdf file + """ + + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + + pdf_chunk_udf = blob_func.TransformFunction( + blob_func.pdf_extract_def, + session=self._block.session, + connection=connection, + ).udf() + + src_rt = self._get_runtime_json_str(mode="R") + res = src_rt.apply(pdf_chunk_udf) + return res + + def pdf_chunk( + self, + *, + connection: Optional[str] = None, + chunk_size: int = 1000, + overlap_size: int = 200, + ) -> bigframes.series.Series: + """Extracts and chunks text from PDF URLs and saves the text as + arrays of strings. 
+ + .. note:: + BigFrames Blob is still under experiments. It may not work and + subject to change in the future. + + Args: + connection (str or None, default None): BQ connection used for + function internet transactions, and the output blob if "dst" + is str. If None, uses default connection of the session. + chunk_size (int, default 1000): the desired size of each text chunk + (number of characters). + overlap_size (int, default 200): the number of overlapping characters + between consective chunks. The helps to ensure context is + perserved across chunk boundaries. + + Returns: + bigframe.series.Series of array[str], where each string is a + chunk of text extracted from PDF. + """ + + import bigframes.bigquery as bbq + import bigframes.blob._functions as blob_func + + connection = self._resolve_connection(connection) + + if chunk_size <= 0: + raise ValueError("chunk_size must be a positive integer.") + if overlap_size < 0: + raise ValueError("overlap_size must be a non-negative integer.") + if overlap_size >= chunk_size: + raise ValueError("overlap_size must be smaller than chunk_size.") + + pdf_chunk_udf = blob_func.TransformFunction( + blob_func.pdf_chunk_def, + session=self._block.session, + connection=connection, + ).udf() + + src_rt = self._get_runtime_json_str(mode="R") + df = src_rt.to_frame() + df["chunk_size"] = chunk_size + df["overlap_size"] = overlap_size + + res = df.apply(pdf_chunk_udf, axis=1) + + res_array = bbq.json_extract_string_array(res) + return res_array From eff964b65adff6e24380b9f468c85d5592d11081 Mon Sep 17 00:00:00 2001 From: Shenyang Cai Date: Mon, 10 Feb 2025 18:10:04 -0800 Subject: [PATCH 21/22] refactor: rename rewrite.operators file to rewrite.timedeltas (#1382) --- bigframes/core/compile/compiler.py | 8 ++++---- bigframes/core/rewrite/__init__.py | 4 ++-- bigframes/core/rewrite/{operators.py => timedeltas.py} | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) rename bigframes/core/rewrite/{operators.py => timedeltas.py} (97%) diff --git a/bigframes/core/compile/compiler.py b/bigframes/core/compile/compiler.py index ff5f1d61c8..64a0ae265f 100644 --- a/bigframes/core/compile/compiler.py +++ b/bigframes/core/compile/compiler.py @@ -58,7 +58,7 @@ def compile_sql( # TODO: get rid of output_ids arg assert len(output_ids) == len(list(node.fields)) node = set_output_names(node, output_ids) - node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) if ordered: node, limit = rewrites.pullup_limit_from_slice(node) node = nodes.bottom_up(node, rewrites.rewrite_slice) @@ -85,7 +85,7 @@ def compile_sql( def compile_peek_sql(self, node: nodes.BigFrameNode, n_rows: int) -> str: ids = [id.sql for id in node.ids] node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) @@ -99,7 +99,7 @@ def compile_raw( str, typing.Sequence[google.cloud.bigquery.SchemaField], bf_ordering.RowOrdering ]: node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, ordering = rewrites.pull_up_order(node, ordered_joins=self.strict) node = rewrites.column_pruning(node) ir = self.compile_node(node) @@ -108,7 +108,7 @@ def compile_raw( def _preprocess(self, node: 
nodes.BigFrameNode): node = nodes.bottom_up(node, rewrites.rewrite_slice) - node = nodes.top_down(node, rewrites.rewrite_timedelta_ops) + node = nodes.top_down(node, rewrites.rewrite_timedelta_expressions) node, _ = rewrites.pull_up_order( node, order_root=False, ordered_joins=self.strict ) diff --git a/bigframes/core/rewrite/__init__.py b/bigframes/core/rewrite/__init__.py index bf93fa51b6..e5f7578911 100644 --- a/bigframes/core/rewrite/__init__.py +++ b/bigframes/core/rewrite/__init__.py @@ -15,16 +15,16 @@ from bigframes.core.rewrite.identifiers import remap_variables from bigframes.core.rewrite.implicit_align import try_row_join from bigframes.core.rewrite.legacy_align import legacy_join_as_projection -from bigframes.core.rewrite.operators import rewrite_timedelta_ops from bigframes.core.rewrite.order import pull_up_order from bigframes.core.rewrite.pruning import column_pruning from bigframes.core.rewrite.slices import pullup_limit_from_slice, rewrite_slice +from bigframes.core.rewrite.timedeltas import rewrite_timedelta_expressions __all__ = [ "legacy_join_as_projection", "try_row_join", "rewrite_slice", - "rewrite_timedelta_ops", + "rewrite_timedelta_expressions", "pullup_limit_from_slice", "remap_variables", "pull_up_order", diff --git a/bigframes/core/rewrite/operators.py b/bigframes/core/rewrite/timedeltas.py similarity index 97% rename from bigframes/core/rewrite/operators.py rename to bigframes/core/rewrite/timedeltas.py index 136e9cc220..d740b28d7d 100644 --- a/bigframes/core/rewrite/operators.py +++ b/bigframes/core/rewrite/timedeltas.py @@ -28,7 +28,7 @@ class _TypedExpr: dtype: dtypes.Dtype -def rewrite_timedelta_ops(root: nodes.BigFrameNode) -> nodes.BigFrameNode: +def rewrite_timedelta_expressions(root: nodes.BigFrameNode) -> nodes.BigFrameNode: """ Rewrites expressions to properly handle timedelta values, because this type does not exist in the SQL world. 
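After the rename, the call sites in compiler.py read nodes.top_down(node, rewrites.rewrite_timedelta_expressions): a rewrite is just a function from node to node that a generic traversal applies from the root downward. The sketch below illustrates that traversal pattern on a toy tree; the Node class, the top_down helper, and the rename_timedelta_nodes rule are invented for illustration and do not reflect the real bigframes node classes.

from __future__ import annotations

import dataclasses
from typing import Callable, Tuple


@dataclasses.dataclass(frozen=True)
class Node:
    name: str
    children: Tuple["Node", ...] = ()


def top_down(node: Node, rule: Callable[[Node], Node]) -> Node:
    # Apply the rule to this node first, then rewrite each child subtree.
    rewritten = rule(node)
    new_children = tuple(top_down(child, rule) for child in rewritten.children)
    return dataclasses.replace(rewritten, children=new_children)


def rename_timedelta_nodes(node: Node) -> Node:
    # Stand-in for rewrite_timedelta_expressions: rewrite matching nodes,
    # pass everything else through unchanged.
    if node.name == "timedelta_op":
        return dataclasses.replace(node, name="int64_op")
    return node


tree = Node("project", (Node("timedelta_op", (Node("scan"),)),))
print(top_down(tree, rename_timedelta_nodes))

A bottom_up variant, used for rewrite_slice in the same hunks, would differ only in recursing into the children before applying the rule to the node itself.
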
From 641abead628a47a15fd54f785db78fd69797a465 Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 11 Feb 2025 06:26:41 -0600 Subject: [PATCH 22/22] chore(main): release 1.36.0 (#1360) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Huan Chen <142538604+Genesis929@users.noreply.github.com> --- CHANGELOG.md | 20 ++++++++++++++++++++ bigframes/version.py | 2 +- third_party/bigframes_vendored/version.py | 2 +- 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index af87cae3b2..a8ebb7a417 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,26 @@ [1]: https://pypi.org/project/bigframes/#history +## [1.36.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.35.0...v1.36.0) (2025-02-11) + + +### Features + +* Add `bigframes.bigquery.st_area` and suggest it from `GeoSeries.area` ([#1318](https://github.com/googleapis/python-bigquery-dataframes/issues/1318)) ([8b5ffa8](https://github.com/googleapis/python-bigquery-dataframes/commit/8b5ffa8893b51016c51794865c40def74ea6716b)) +* Add `GeoSeries.from_xy()` ([#1364](https://github.com/googleapis/python-bigquery-dataframes/issues/1364)) ([3c3e14c](https://github.com/googleapis/python-bigquery-dataframes/commit/3c3e14c715f476ca44f254c0d53d639ea5988a8d)) + + +### Bug Fixes + +* Dtype parameter ineffective in Series/DataFrame construction ([#1354](https://github.com/googleapis/python-bigquery-dataframes/issues/1354)) ([b9bdca8](https://github.com/googleapis/python-bigquery-dataframes/commit/b9bdca8285ee54fecf3795fbf3cbea6f878ee8ca)) +* Translate labels to col ids when copying dataframes ([#1372](https://github.com/googleapis/python-bigquery-dataframes/issues/1372)) ([0c55b07](https://github.com/googleapis/python-bigquery-dataframes/commit/0c55b07dc001b568875f06d578ca7d59409f2a11)) + + +### Performance Improvements + +* Prune unused operations from sql ([#1365](https://github.com/googleapis/python-bigquery-dataframes/issues/1365)) ([923da03](https://github.com/googleapis/python-bigquery-dataframes/commit/923da037ef6e4e7f8b54924ea5644c2c5ceb2234)) +* Simplify merge join key coalescing ([#1361](https://github.com/googleapis/python-bigquery-dataframes/issues/1361)) ([7ae565d](https://github.com/googleapis/python-bigquery-dataframes/commit/7ae565d9e0e59fdf75c7659c0263562688ccc1e8)) + ## [1.35.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v1.34.0...v1.35.0) (2025-02-04) diff --git a/bigframes/version.py b/bigframes/version.py index d9b9875805..e92072bea8 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.35.0" +__version__ = "1.36.0" diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index d9b9875805..e92072bea8 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.35.0" +__version__ = "1.36.0"
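Patch 20 in this series (blob.pdf_chunk) splits the extracted PDF text with a sliding window controlled by chunk_size and overlap_size, serializes the resulting list to a JSON string inside the UDF, and decodes it back into an array column with bigframes.bigquery.json_extract_string_array. The standalone sketch below isolates just that chunking loop so its boundary behavior can be checked locally without BigQuery, Cloud Storage, or pypdf; the chunk_text function name and the sample text are invented for illustration.

import json
from typing import List


def chunk_text(text: str, chunk_size: int = 1000, overlap_size: int = 200) -> List[str]:
    # Sliding-window chunking as in pdf_chunk_func: prefer to split at the
    # last space before chunk_size, and carry overlap_size characters of
    # context into the start of the next chunk.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if overlap_size < 0:
        raise ValueError("overlap_size must be a non-negative integer.")
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size.")

    chunks: List[str] = []
    curr = text
    while len(curr) >= chunk_size:
        split_idx = curr.rfind(" ", 0, chunk_size)
        if split_idx == -1:
            split_idx = chunk_size
        chunks.append(curr[:split_idx])
        overlap = curr[split_idx + 1 : split_idx + 1 + overlap_size]
        curr = overlap + curr[split_idx + 1 + overlap_size :]
    if curr:
        chunks.append(curr)
    return chunks


sample = "lorem ipsum dolor sit amet " * 20  # invented sample text
print(json.dumps(chunk_text(sample, chunk_size=50, overlap_size=10), indent=2))

On the BigFrames side the same knobs are exposed through the blob accessor, so usage would look roughly like series.blob.pdf_chunk(chunk_size=1000, overlap_size=200) on a Series that already holds blob objects, with the default session connection used unless one is passed explicitly.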