From a32b74751785c8e8aec40ce01df639dd7c4fbb77 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Thu, 24 Aug 2023 11:04:40 -0500 Subject: [PATCH 1/7] chore: sync latest changes from internal repo (#7) docs: highlight bigframes is open-source docs: correct the return types of Dataframe and Series docs: create subfolders for notebooks feat: add `bigframes.get_global_session()` and `bigframes.reset_session()` aliases chore: mark ml.llm tests flaky chore: make kokoro/build.sh executable feat: add `Series.str` methods `isalpha`, `isdigit`, `isdecimal`, `isalnum`, `isspace`, `islower`, `isupper`, `zfill`, `center` chore: pin max pytest-retry plugin version in tests docs: sample ML Drug Name Generation notebook docs: add samples and best practices to `read_gbq` docs chore: fix Python download path in docs-presubmit tests perf: add local cache for `__repr_*__` methods feat: support `DataFrame.pivot` fix: don't use query cache for Session construction feat: add `bigframes.pandas.read_pickle` function feat: support MultiIndex for DataFrame columns chore: change the docs kokoro setup to Gerrit path docs: transform remote function user guide into sample code fix: raise exception for invalid function in `read_gbq_function` docs: add release status to table of contents feat: add `fit_transform` to `bigquery.ml` transformers feat: use `pandas.Index` for column labels docs: add ML section under Overview fix: check that types are specified in `read_gbq_function` fix: add error message to `set_index` --- .kokoro/build.sh | 0 .kokoro/docker/docs/Dockerfile | 13 +- .kokoro/docs/common.cfg | 4 +- .kokoro/docs/docs-presubmit.cfg | 2 +- README.rst | 126 +- bigframes/__init__.py | 5 +- bigframes/core/blocks.py | 239 +- bigframes/core/global_session.py | 65 + bigframes/core/groupby/__init__.py | 149 +- bigframes/core/indexers.py | 2 +- bigframes/dataframe.py | 120 +- bigframes/ml/base.py | 24 +- bigframes/ml/compose.py | 2 +- bigframes/ml/preprocessing.py | 4 +- 
bigframes/operations/__init__.py | 96 +- bigframes/operations/aggregations.py | 32 +- bigframes/operations/strings.py | 41 + bigframes/pandas/__init__.py | 138 +- bigframes/remote_function.py | 83 +- bigframes/series.py | 4 +- bigframes/session.py | 42 +- docs/index.rst | 1 - docs/templates/toc.yml | 1 + .../bigframes.pandas/remote_functions.rst | 134 -- docs/user_guide/index.rst | 9 - notebooks/00 - Summary.ipynb | 2060 ----------------- notebooks/01 - Getting Started.ipynb | 1190 ---------- .../dataframe.ipynb} | 4 +- .../longer_ml_demo.ipynb} | 0 ...q_dataframes_ml_drug_name_generation.ipynb | 980 ++++++++ .../large_language_models.ipynb} | 0 .../bq_dataframes_llm_code_generation.ipynb | 891 +++++++ .../bq_dataframes_ml_linear_regression.ipynb | 743 ++++++ .../getting_started_bq_dataframes.ipynb | 971 ++++++++ .../ml_fundamentals.ipynb} | 0 .../regionalized.ipynb} | 0 .../easy_linear_regression.ipynb} | 0 .../sklearn_linear_regression.ipynb} | 0 .../remote_function.ipynb} | 0 noxfile.py | 50 +- pytest.ini | 1 + samples/snippets/remote_function.py | 147 ++ samples/snippets/remote_function_test.py | 32 + tests/data/hockey_players.json | 37 + tests/data/hockey_players.jsonl | 10 + tests/system/conftest.py | 79 + tests/system/large/ml/test_compose.py | 53 +- tests/system/small/ml/test_llm.py | 8 + tests/system/small/ml/test_preprocessing.py | 68 +- tests/system/small/operations/test_strings.py | 125 +- tests/system/small/test_dataframe.py | 44 + tests/system/small/test_groupby.py | 4 - tests/system/small/test_ipython.py | 28 + tests/system/small/test_multiindex.py | 177 ++ tests/system/small/test_pandas_options.py | 5 +- tests/system/small/test_remote_function.py | 151 +- tests/system/small/test_session.py | 38 + tests/unit/test_dtypes.py | 7 +- .../bigframes_vendored/pandas/core/frame.py | 115 +- .../bigframes_vendored/pandas/core/series.py | 91 +- .../pandas/core/strings/accessor.py | 140 ++ .../bigframes_vendored/pandas/io/gbq.py | 51 +- 
.../bigframes_vendored/pandas/io/pickle.py | 55 + .../bigframes_vendored/sklearn/base.py | 23 +- .../sklearn/cluster/_kmeans.py | 2 - 65 files changed, 5909 insertions(+), 3807 deletions(-) mode change 100644 => 100755 .kokoro/build.sh create mode 100644 bigframes/core/global_session.py delete mode 100644 docs/user_guide/bigframes.pandas/remote_functions.rst delete mode 100644 docs/user_guide/index.rst delete mode 100644 notebooks/00 - Summary.ipynb delete mode 100644 notebooks/01 - Getting Started.ipynb rename notebooks/{02 - DataFrame.ipynb => dataframes/dataframe.ipynb} (99%) rename notebooks/{99 - Longer ML demo.ipynb => experimental/longer_ml_demo.ipynb} (100%) create mode 100644 notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb rename notebooks/{06 - Using ML - Large Language Models.ipynb => generative_ai/large_language_models.ipynb} (100%) create mode 100644 notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb create mode 100644 notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb create mode 100644 notebooks/getting_started/getting_started_bq_dataframes.ipynb rename notebooks/{03 - Using ML - ML fundamentals.ipynb => getting_started/ml_fundamentals.ipynb} (100%) rename notebooks/{10 - Regionalized.ipynb => location/regionalized.ipynb} (100%) rename notebooks/{05 - Using ML - Easy linear regression.ipynb => regression/easy_linear_regression.ipynb} (100%) rename notebooks/{04 - Using ML - SKLearn linear regression.ipynb => regression/sklearn_linear_regression.ipynb} (100%) rename notebooks/{50 - Remote Function.ipynb => remote_functions/remote_function.ipynb} (100%) create mode 100644 samples/snippets/remote_function.py create mode 100644 samples/snippets/remote_function_test.py create mode 100644 tests/data/hockey_players.json create mode 100644 tests/data/hockey_players.jsonl create mode 100644 tests/system/small/test_ipython.py create mode 100644 third_party/bigframes_vendored/pandas/io/pickle.py diff --git 
a/.kokoro/build.sh b/.kokoro/build.sh old mode 100644 new mode 100755 diff --git a/.kokoro/docker/docs/Dockerfile b/.kokoro/docker/docs/Dockerfile index d300bee260..e8ee8191ee 100644 --- a/.kokoro/docker/docs/Dockerfile +++ b/.kokoro/docker/docs/Dockerfile @@ -60,19 +60,16 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* \ && rm -f /var/cache/apt/archives/*.deb -###################### Install python 3.9.13 and 3.10.5 +###################### Install python 3.9.13 -# Download python 3.9.13 and 3.10.5 +# Download python 3.9.13 RUN wget https://www.python.org/ftp/python/3.9.13/Python-3.9.13.tgz -RUN wget https://www.python.org/ftp/python/3.9.13/Python-3.10.5.tgz # Extract files RUN tar -xvf Python-3.9.13.tgz -RUN tar -xvf Python-3.10.5.tgz -# Install python 3.9.13 and 3.10.5 +# Install python 3.9.13 RUN ./Python-3.9.13/configure --enable-optimizations -RUN ./Python-3.10.5/configure --enable-optimizations RUN make altinstall ###################### Install pip @@ -82,7 +79,5 @@ RUN wget -O /tmp/get-pip.py 'https://bootstrap.pypa.io/get-pip.py' \ # Test pip RUN python3 -m pip -RUN python3.9 -m pip -RUN python3.10 -m pip -CMD ["python3.10"] +CMD ["python3.9"] diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index 3e36916024..ce84d7ec49 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -11,7 +11,7 @@ action { gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" # Use the trampoline script to run in docker. -build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" +build_file: "bigframes/.kokoro/trampoline_v2.sh" # Configure the docker image for kokoro-trampoline. 
env_vars: { @@ -20,7 +20,7 @@ env_vars: { } env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/publish-docs.sh" + value: "git/bigframes/.kokoro/publish-docs.sh" } env_vars: { diff --git a/.kokoro/docs/docs-presubmit.cfg b/.kokoro/docs/docs-presubmit.cfg index 43ec87185e..1d0dc4b499 100644 --- a/.kokoro/docs/docs-presubmit.cfg +++ b/.kokoro/docs/docs-presubmit.cfg @@ -13,7 +13,7 @@ env_vars: { env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "github/python-bigquery-dataframes/.kokoro/build.sh" + value: ".kokoro/build.sh" } # Only run this nox session. diff --git a/README.rst b/README.rst index c6dbb05957..6ae3753eed 100644 --- a/README.rst +++ b/README.rst @@ -7,6 +7,9 @@ powered by the BigQuery engine. * ``bigframes.pandas`` provides a pandas-compatible API for analytics. * ``bigframes.ml`` provides a scikit-learn-like API for ML. +BigQuery DataFrames is an open-source package. You can run +``pip install --upgrade bigframes`` to install the latest version. + Documentation ------------- @@ -65,6 +68,127 @@ querying is not in the US multi-region. If you try to read a table from another location, you get a NotFound exception. +ML Capabilities +--------------- + +The ML capabilities in BigQuery DataFrames let you preprocess data, and +then train models on that data. You can also chain these actions together to +create data pipelines. + +Preprocess data +^^^^^^^^^^^^^^^^^^^^^^^^ + +Create transformers to prepare data for use in estimators (models) by +using the +`bigframes.ml.preprocessing module `_ +and the `bigframes.ml.compose module `_. +BigQuery DataFrames offers the following transformations: + +* Use the `OneHotEncoder class `_ + in the ``bigframes.ml.preprocessing`` module to transform categorical values into numeric format. +* Use the `StandardScaler class `_ + in the ``bigframes.ml.preprocessing`` module to standardize features by removing the mean and scaling to unit variance. 
+* Use the `ColumnTransformer class `_ + in the ``bigframes.ml.compose`` module to apply transformers to DataFrames columns. + + +Train models +^^^^^^^^^^^^ + +Create estimators to train models in BigQuery DataFrames. + +**Clustering models** + +Create estimators for clustering models by using the +`bigframes.ml.cluster module `_. + +* Use the `KMeans class `_ + to create K-means clustering models. Use these models for + data segmentation. For example, identifying customer segments. K-means is an + unsupervised learning technique, so model training doesn't require labels or split + data for training or evaluation. + +**Decomposition models** + +Create estimators for decomposition models by using the `bigframes.ml.decomposition module `_. + +* Use the `PCA class `_ + to create principal component analysis (PCA) models. Use these + models for computing principal components and using them to perform a change of + basis on the data. This provides dimensionality reduction by projecting each data + point onto only the first few principal components to obtain lower-dimensional + data while preserving as much of the data's variation as possible. + + +**Ensemble models** + +Create estimators for ensemble models by using the `bigframes.ml.ensemble module `_. + +* Use the `RandomForestClassifier class `_ + to create random forest classifier models. Use these models for constructing multiple + learning method decision trees for classification. +* Use the `RandomForestRegressor class `_ + to create random forest regression models. Use + these models for constructing multiple learning method decision trees for regression. +* Use the `XGBClassifier class `_ + to create gradient boosted tree classifier models. Use these models for additively + constructing multiple learning method decision trees for classification. +* Use the `XGBRegressor class `_ + to create gradient boosted tree regression models. 
Use these models for additively + constructing multiple learning method decision trees for regression. + + +**Forecasting models** + +Create estimators for forecasting models by using the `bigframes.ml.forecasting module `_. + +* Use the `ARIMAPlus class `_ + to create time series forecasting models. + +**Imported models** + +Create estimators for imported models by using the `bigframes.ml.imported module `_. + +* Use the `ONNXModel class `_ + to import Open Neural Network Exchange (ONNX) models. +* Use the `TensorFlowModel class `_ + to import TensorFlow models. + +**Linear models** + +Create estimators for linear models by using the `bigframes.ml.linear_model module `_. + +* Use the `LinearRegression class `_ + to create linear regression models. Use these models for forecasting. For example, + forecasting the sales of an item on a given day. +* Use the `LogisticRegression class `_ + to create logistic regression models. Use these models for the classification of two + or more possible values such as whether an input is ``low-value``, ``medium-value``, + or ``high-value``. + +**Large language models** + +Create estimators for LLMs by using the `bigframes.ml.llm module `_. + +* Use the `PaLM2TextGenerator class `_ to create PaLM2 text generator models. Use these models + for text generation tasks. +* Use the `PaLM2TextEmbeddingGenerator class `_ to create PaLM2 text embedding generator models. + Use these models for text embedding generation tasks. + + +Create pipelines +^^^^^^^^^^^^^^^^ + +Create ML pipelines by using +`bigframes.ml.pipeline module `_. +Pipelines let you assemble several ML steps to be cross-validated together while setting +different parameters. This simplifies your code, and allows you to deploy data preprocessing +steps and an estimator together. + +* Use the `Pipeline class `_ + to create a pipeline of transforms with a final estimator. 
+ + ML locations ------------ @@ -181,7 +305,7 @@ following IAM roles: Quotas and limits ------------------ +------------------ `BigQuery quotas `_ including hardware, software, and network components. diff --git a/bigframes/__init__.py b/bigframes/__init__.py index 2ee745bc52..3e54a6d090 100644 --- a/bigframes/__init__.py +++ b/bigframes/__init__.py @@ -16,13 +16,16 @@ from bigframes._config import options from bigframes._config.bigquery_options import BigQueryOptions +from bigframes.core.global_session import get_global_session, reset_session from bigframes.session import connect, Session from bigframes.version import __version__ __all__ = [ + "options", "BigQueryOptions", + "get_global_session", + "reset_session", "connect", - "options", "Session", "__version__", ] diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f696b8287b..2731990feb 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -53,6 +53,10 @@ _BYTES_TO_KILOBYTES = 1024 _BYTES_TO_MEGABYTES = _BYTES_TO_KILOBYTES * 1024 +# This is the max limit of physical columns in BQ +# May choose to set smaller limit for number of block columns to allow overhead for ordering, etc. 
+_BQ_MAX_COLUMNS = 10000 + # All sampling method _HEAD = "head" _UNIFORM = "uniform" @@ -75,9 +79,9 @@ class Block: def __init__( self, expr: core.ArrayValue, - index_columns: Iterable[str] = (), - column_labels: Optional[Sequence[Label]] = None, - index_labels: Optional[Sequence[Label]] = None, + index_columns: Iterable[str], + column_labels: typing.Union[pd.Index, typing.Sequence[Label]], + index_labels: typing.Union[pd.Index, typing.Sequence[Label], None] = None, ): """Construct a block object, will create default index if no index columns specified.""" if index_labels and (len(index_labels) != len(list(index_columns))): @@ -88,15 +92,18 @@ def __init__( expr, new_index_col_id = expr.promote_offsets() index_columns = [new_index_col_id] self._index_columns = tuple(index_columns) + # Index labels don't need complicated hierarchical access so can store as tuple self._index_labels = ( tuple(index_labels) if index_labels else tuple([None for _ in index_columns]) ) self._expr = self._normalize_expression(expr, self._index_columns) - # TODO(tbergeron): Force callers to provide column labels + # Use pandas index to more easily replicate column indexing, especially for hierarchical column index self._column_labels = ( - tuple(column_labels) if column_labels else tuple(self.value_columns) + column_labels.copy() + if isinstance(column_labels, pd.Index) + else pd.Index(column_labels) ) if len(self.value_columns) != len(self._column_labels): raise ValueError( @@ -139,8 +146,8 @@ def value_columns(self) -> Sequence[str]: ] @property - def column_labels(self) -> List[Label]: - return list(self._column_labels) + def column_labels(self) -> pd.Index: + return self._column_labels @property def expr(self) -> core.ArrayValue: @@ -193,6 +200,24 @@ def index_name_to_col_id(self) -> typing.Mapping[Label, typing.Sequence[str]]: mapping[label] = (*mapping.get(label, ()), id) return mapping + def cols_matching_label(self, partial_label: Label) -> typing.Sequence[str]: + """ + Unlike 
label_to_col_id, this works with partial labels for multi-index. + + Only some methods, like __getitem__ can use a partial key to get columns + from a dataframe. These methods should use cols_matching_label, while + methods that require exact label matches should use label_to_col_id. + """ + # TODO(tbergeron): Refactor so that all label lookups use this method + if partial_label not in self.column_labels: + return [] + loc = self.column_labels.get_loc(partial_label) + if isinstance(loc, int): + return [self.value_columns[loc]] + if isinstance(loc, slice): + return self.value_columns[loc] + return [col for col, is_present in zip(self.value_columns, loc) if is_present] + def order_by( self, by: typing.Sequence[ordering.OrderingColumnReference], @@ -237,8 +262,9 @@ def reset_index(self, drop: bool = True) -> Block: index_labels=[None], ) else: + # Add index names to column index index_labels = self.index.names - index_labels_rewritten = [] + column_labels_modified = self.column_labels for level, label in enumerate(index_labels): if label is None: if "index" not in self.column_labels: @@ -248,12 +274,17 @@ def reset_index(self, drop: bool = True) -> Block: if label in self.column_labels: raise ValueError(f"cannot insert {label}, already exists") - index_labels_rewritten.append(label) + if isinstance(self.column_labels, pd.MultiIndex): + nlevels = self.column_labels.nlevels + label = tuple(label if i == 0 else "" for i in range(nlevels)) + # Create index copy with label inserted + # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html + column_labels_modified = column_labels_modified.insert(level, label) block = Block( expr, index_columns=[new_index_col_id], - column_labels=[*index_labels_rewritten, *self.column_labels], + column_labels=column_labels_modified, index_labels=[None], ) return block @@ -568,8 +599,11 @@ def _apply_value_keys_to_expr(self, value_keys: Optional[Iterable[str]] = None): expr = 
expr.select_columns(itertools.chain(self._index_columns, value_keys)) return expr - def with_column_labels(self, value: typing.Iterable[Label]) -> Block: - label_list = tuple(value) + def with_column_labels( + self, + value: typing.Union[pd.Index, typing.Iterable[Label]], + ) -> Block: + label_list = value.copy() if isinstance(value, pd.Index) else pd.Index(value) if len(label_list) != len(self.value_columns): raise ValueError( f"The column labels size `{len(label_list)} ` should equal to the value" @@ -742,7 +776,9 @@ def create_constant( ) -> typing.Tuple[Block, str]: result_id = guid.generate_guid() expr = self.expr.assign_constant(result_id, scalar_constant, dtype=dtype) - labels = [*self.column_labels, label] + # Create index copy with label inserted + # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html + labels = self.column_labels.insert(len(self.column_labels), label) return ( Block( expr, @@ -755,8 +791,11 @@ def create_constant( def assign_label(self, column_id: str, new_label: Label) -> Block: col_index = self.value_columns.index(column_id) - new_labels = list(self.column_labels) - new_labels[col_index] = new_label + # Create index copy with label inserted + # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html + new_labels = self.column_labels.insert(col_index, new_label).delete( + col_index + 1 + ) return self.with_column_labels(new_labels) def filter(self, column_name: str, keep_null: bool = False): @@ -790,7 +829,7 @@ def aggregate_all_and_pivot( result_expr = self.expr.aggregate( aggregations, dropna=dropna ).unpivot_single_row( - row_labels=self.column_labels, + row_labels=self.column_labels.to_list(), index_col_id="index", unpivot_columns=[(value_col_id, self.value_columns)], dtype=dtype, @@ -818,11 +857,28 @@ def drop_columns(self, ids_to_drop: typing.Sequence[str]) -> Block: labels = self._get_labels_for_columns(remaining_value_col_ids) return Block(expr, self.index_columns, labels, 
self.index.names) - def rename(self, *, columns: typing.Mapping[Label, Label]): - # TODO(tbergeron) Support function(Callable) as columns parameter. - col_labels = [ - (columns.get(col_label, col_label)) for col_label in self.column_labels - ] + def rename( + self, + *, + columns: typing.Mapping[Label, Label] | typing.Callable[[typing.Any], Label], + ): + if isinstance(columns, typing.Mapping): + + def remap_f(x): + return columns.get(x, x) + + else: + remap_f = columns + if isinstance(self.column_labels, pd.MultiIndex): + col_labels: list[Label] = [] + for col_label in self.column_labels: + # Mapper applies to each level separately + modified_label = tuple(remap_f(part) for part in col_label) + col_labels.append(modified_label) + else: + col_labels = [] + for col_label in self.column_labels: + col_labels.append(remap_f(col_label)) return self.with_column_labels(col_labels) def aggregate( @@ -874,10 +930,16 @@ def aggregate( ] by_column_labels = self._get_labels_for_columns(by_value_columns) labels = (*by_column_labels, *aggregate_labels) - result_expr_pruned = result_expr.select_columns( + result_expr_pruned, offsets_id = result_expr.select_columns( [*by_value_columns, *output_col_ids] + ).promote_offsets() + + return ( + Block( + result_expr_pruned, index_columns=[offsets_id], column_labels=labels + ), + output_col_ids, ) - return Block(result_expr_pruned, column_labels=labels), output_col_ids def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): """Gets aggregates immediately, and caches it""" @@ -891,7 +953,12 @@ def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): aggregations = [(column_id, stat, stat.name) for stat in stats_to_fetch] expr = self.expr.aggregate(aggregations) - block = Block(expr, column_labels=[s.name for s in stats_to_fetch]) + expr, offset_index_id = expr.promote_offsets() + block = Block( + expr, + index_columns=[offset_index_id], + column_labels=[s.name for s in stats_to_fetch], + ) df, _ = block.to_pandas() # Carefully 
extract stats such that they aren't coerced to a common type @@ -988,6 +1055,10 @@ def slice( ) return block + # Using cache to optimize for Jupyter Notebook's behavior where both '__repr__' + # and '__repr_html__' are called in a single display action, reducing redundant + # queries. + @functools.cache def retrieve_repr_request_results( self, max_results: int ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]: @@ -1038,13 +1109,7 @@ def add_prefix(self, prefix: str, axis: str | int | None = None) -> Block: index_labels=self.index.names, ) if axis_number == 1: - expr = self._expr - return Block( - self._expr, - index_columns=self.index_columns, - column_labels=[f"{prefix}{label}" for label in self.column_labels], - index_labels=self.index.names, - ) + return self.rename(columns=lambda label: f"{prefix}{label}") def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: axis_number = bigframes.core.utils.get_axis_number(axis) @@ -1061,13 +1126,110 @@ def add_suffix(self, suffix: str, axis: str | int | None = None) -> Block: index_labels=self.index.names, ) if axis_number == 1: - expr = self._expr - return Block( - self._expr, - index_columns=self.index_columns, - column_labels=[f"{label}{suffix}" for label in self.column_labels], - index_labels=self.index.names, + return self.rename(columns=lambda label: f"{label}{suffix}") + + def pivot( + self, + *, + columns: Sequence[str], + values: Sequence[str], + values_in_index: typing.Optional[bool] = None, + ): + # Columns+index should uniquely identify rows + # Warning: This is not validated, breaking this constraint will result in silently non-deterministic behavior. 
+ # -1 to allow for ordering column in addition to pivot columns + max_unique_value = (_BQ_MAX_COLUMNS - 1) // len(values) + columns_values = self._get_unique_values(columns, max_unique_value) + column_index = columns_values + + column_ids: list[str] = [] + block = self + for value in values: + for uvalue in columns_values: + block, masked_id = self._create_pivot_col(block, columns, value, uvalue) + column_ids.append(masked_id) + + block = block.select_columns(column_ids) + aggregations = [(col_id, agg_ops.AnyValueOp()) for col_id in column_ids] + result_block, _ = block.aggregate( + by_column_ids=self.index_columns, + aggregations=aggregations, + as_index=True, + dropna=True, + ) + + if values_in_index or len(values) > 1: + value_labels = self._get_labels_for_columns(values) + column_index = self._create_pivot_column_index(value_labels, columns_values) + else: + column_index = columns_values + + return result_block.with_column_labels(column_index) + + @staticmethod + def _create_pivot_column_index( + value_labels: Sequence[typing.Hashable], columns_values: pd.Index + ): + index_parts = [] + for value in value_labels: + as_frame = columns_values.to_frame() + as_frame.insert(0, None, value) # type: ignore + ipart = pd.MultiIndex.from_frame( + as_frame, names=(None, *columns_values.names) ) + index_parts.append(ipart) + return functools.reduce(lambda x, y: x.append(y), index_parts) + + @staticmethod + def _create_pivot_col( + block: Block, columns: typing.Sequence[str], value_col: str, value + ) -> typing.Tuple[Block, str]: + cond_id = "" + nlevels = len(columns) + for i in range(len(columns)): + uvalue_level = value[i] if nlevels > 1 else value + if pd.isna(uvalue_level): + block, eq_id = block.apply_unary_op( + columns[i], + ops.isnull_op, + ) + else: + block, eq_id = block.apply_unary_op( + columns[i], ops.partial_right(ops.eq_op, uvalue_level) + ) + if cond_id: + block, cond_id = block.apply_binary_op(eq_id, cond_id, ops.and_op) + else: + cond_id = eq_id + block, 
masked_id = block.apply_binary_op( + value_col, cond_id, ops.partial_arg3(ops.where_op, None) + ) + + return block, masked_id + + def _get_unique_values( + self, columns: Sequence[str], max_unique_values: int + ) -> pd.Index: + """Gets N unique values for a column immediately.""" + # Importing here to avoid circular import + import bigframes.core.block_transforms as block_tf + import bigframes.dataframe as df + + unique_value_block = block_tf.drop_duplicates( + self.select_columns(columns), columns + ) + pd_values = ( + df.DataFrame(unique_value_block).head(max_unique_values + 1).to_pandas() + ) + if len(pd_values) > max_unique_values: + raise ValueError(f"Too many unique values: {pd_values}") + + if len(columns) > 1: + return pd.MultiIndex.from_frame( + pd_values.sort_values(by=list(pd_values.columns), na_position="first") + ) + else: + return pd.Index(pd_values.squeeze(axis=1).sort_values(na_position="first")) def concat( self, @@ -1138,8 +1300,9 @@ def block_from_local(data, session=None, use_index=True) -> Block: ) else: keys_expr = core.ArrayValue.mem_expr_from_pandas(pd_data, session) + keys_expr, offsets_id = keys_expr.promote_offsets() # Constructor will create default range index - return Block(keys_expr, column_labels=column_labels) + return Block(keys_expr, index_columns=[offsets_id], column_labels=column_labels) def _align_block_to_schema( diff --git a/bigframes/core/global_session.py b/bigframes/core/global_session.py new file mode 100644 index 0000000000..68529981cd --- /dev/null +++ b/bigframes/core/global_session.py @@ -0,0 +1,65 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Utilities for managing a default, globally available Session object.""" + +import threading +from typing import Callable, Optional, TypeVar + +import bigframes._config +import bigframes.session + +_global_session: Optional[bigframes.session.Session] = None +_global_session_lock = threading.Lock() + + +def reset_session() -> None: + """Start a fresh session the next time a function requires a session. + + Closes the current session if it was already started. + + Returns: + None + """ + global _global_session + + with _global_session_lock: + if _global_session is not None: + _global_session.close() + _global_session = None + + bigframes._config.options.bigquery._session_started = False + + +def get_global_session(): + """Gets the global session. + + Creates the global session if it does not exist. 
+ """ + global _global_session, _global_session_lock + + with _global_session_lock: + if _global_session is None: + _global_session = bigframes.session.connect( + bigframes._config.options.bigquery + ) + + return _global_session + + +_T = TypeVar("_T") + + +def with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: + return func(get_global_session(), *args, **kwargs) diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 10bee4f56d..5b217effdd 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -15,7 +15,8 @@ from __future__ import annotations import typing -import warnings + +import pandas as pd import bigframes.constants as constants import bigframes.core as core @@ -102,12 +103,12 @@ def __getitem__( def sum(self, numeric_only: bool = False, *args) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("sum") - return self._aggregate(agg_ops.sum_op, numeric_only=True) + return self._aggregate_all(agg_ops.sum_op, numeric_only=True) def mean(self, numeric_only: bool = False, *args) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("mean") - return self._aggregate(agg_ops.mean_op, numeric_only=True) + return self._aggregate_all(agg_ops.mean_op, numeric_only=True) def median( self, numeric_only: bool = False, *, exact: bool = False @@ -118,13 +119,13 @@ def median( ) if not numeric_only: self._raise_on_non_numeric("median") - return self._aggregate(agg_ops.median_op, numeric_only=True) + return self._aggregate_all(agg_ops.median_op, numeric_only=True) def min(self, numeric_only: bool = False, *args) -> df.DataFrame: - return self._aggregate(agg_ops.min_op, numeric_only=numeric_only) + return self._aggregate_all(agg_ops.min_op, numeric_only=numeric_only) def max(self, numeric_only: bool = False, *args) -> df.DataFrame: - return self._aggregate(agg_ops.max_op, numeric_only=numeric_only) + return self._aggregate_all(agg_ops.max_op, 
numeric_only=numeric_only) def std( self, @@ -133,7 +134,7 @@ def std( ) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("std") - return self._aggregate(agg_ops.std_op, numeric_only=True) + return self._aggregate_all(agg_ops.std_op, numeric_only=True) def var( self, @@ -142,16 +143,16 @@ def var( ) -> df.DataFrame: if not numeric_only: self._raise_on_non_numeric("var") - return self._aggregate(agg_ops.var_op, numeric_only=True) + return self._aggregate_all(agg_ops.var_op, numeric_only=True) def all(self) -> df.DataFrame: - return self._aggregate(agg_ops.all_op) + return self._aggregate_all(agg_ops.all_op) def any(self) -> df.DataFrame: - return self._aggregate(agg_ops.any_op) + return self._aggregate_all(agg_ops.any_op) def count(self) -> df.DataFrame: - return self._aggregate(agg_ops.count_op) + return self._aggregate_all(agg_ops.count_op) def cumsum(self, *args, numeric_only: bool = False, **kwargs) -> df.DataFrame: if not numeric_only: @@ -168,71 +169,97 @@ def cumprod(self, *args, **kwargs) -> df.DataFrame: return self._apply_window_op(agg_ops.product_op, numeric_only=True) def agg(self, func=None, **kwargs) -> df.DataFrame: - column_labels = [] if func: - warnings.warn( - "DataFrameGroupby aggregate produces single-level column labels only currently. Subject to change in future versions." 
- ) if isinstance(func, str): - aggregations = [ - (col_id, agg_ops.AGGREGATIONS_LOOKUP[func]) - for col_id in self._aggregated_columns() - ] + return self._agg_string(func) elif utils.is_dict_like(func): - aggregations = [] - for label, funcs_for_id in func.items(): - col_id = self._resolve_label(label) - func_list = ( - funcs_for_id - if utils.is_list_like(funcs_for_id) - else [funcs_for_id] - ) - for f in func_list: - aggregations.append((col_id, agg_ops.AGGREGATIONS_LOOKUP[f])) - # Pandas creates multi-index here instead - column_labels.append(f"{label}_{f}") + return self._agg_dict(func) elif utils.is_list_like(func): - aggregations = [ - (col_id, agg_ops.AGGREGATIONS_LOOKUP[f]) - for col_id in self._aggregated_columns() - for f in func - ] - column_labels = [ - f"{self._block.col_id_to_label[col_id]}_{f}" - for col_id in self._aggregated_columns() - for f in func - ] + return self._agg_list(func) else: raise NotImplementedError( f"Aggregate with {func} not supported. {constants.FEEDBACK_LINK}" ) else: - aggregations = [] - for k, v in kwargs.items(): - if not isinstance(k, str): - raise NotImplementedError( - f"Only string aggregate names supported. 
{constants.FEEDBACK_LINK}" - ) - if not hasattr(v, "column") or not hasattr(v, "aggfunc"): - import bigframes.pandas as bpd - - raise NotImplementedError( - f"kwargs values must be {bpd.NamedAgg.__qualname__}" - ) - col_id = self._resolve_label(v.column) - aggregations.append((col_id, agg_ops.AGGREGATIONS_LOOKUP[v.aggfunc])) - column_labels.append(k) + return self._agg_named(**kwargs) + def _agg_string(self, func: str) -> df.DataFrame: + aggregations = [ + (col_id, agg_ops.lookup_agg_func(func)) + for col_id in self._aggregated_columns() + ] agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, as_index=self._as_index, dropna=self._dropna, ) + return df.DataFrame(agg_block) - if column_labels: - agg_block = agg_block.with_column_labels(column_labels) + def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: + aggregations = [] + column_labels = [] + for label, funcs_for_id in func.items(): + col_id = self._resolve_label(label) + func_list = ( + funcs_for_id if utils.is_list_like(funcs_for_id) else [funcs_for_id] + ) + for f in func_list: + aggregations.append((col_id, agg_ops.lookup_agg_func(f))) + column_labels.append((col_id, f)) + agg_block, _ = self._block.aggregate( + by_column_ids=self._by_col_ids, + aggregations=aggregations, + as_index=self._as_index, + dropna=self._dropna, + ) + agg_block = agg_block.with_column_labels( + pd.MultiIndex.from_tuples(column_labels) + ) + return df.DataFrame(agg_block) + def _agg_list(self, func: typing.Sequence) -> df.DataFrame: + aggregations = [ + (col_id, agg_ops.lookup_agg_func(f)) + for col_id in self._aggregated_columns() + for f in func + ] + column_labels = [ + (col_id, f) for col_id in self._aggregated_columns() for f in func + ] + agg_block, _ = self._block.aggregate( + by_column_ids=self._by_col_ids, + aggregations=aggregations, + as_index=self._as_index, + dropna=self._dropna, + ) + agg_block = agg_block.with_column_labels( + pd.MultiIndex.from_tuples(column_labels) + 
) + return df.DataFrame(agg_block) + + def _agg_named(self, **kwargs) -> df.DataFrame: + aggregations = [] + column_labels = [] + for k, v in kwargs.items(): + if not isinstance(k, str): + raise NotImplementedError( + f"Only string aggregate names supported. {constants.FEEDBACK_LINK}" + ) + if not hasattr(v, "column") or not hasattr(v, "aggfunc"): + import bigframes.pandas as bpd + + raise TypeError(f"kwargs values must be {bpd.NamedAgg.__qualname__}") + col_id = self._resolve_label(v.column) + aggregations.append((col_id, agg_ops.lookup_agg_func(v.aggfunc))) + column_labels.append(k) + agg_block, _ = self._block.aggregate( + by_column_ids=self._by_col_ids, + aggregations=aggregations, + as_index=self._as_index, + dropna=self._dropna, + ) + agg_block = agg_block.with_column_labels(column_labels) return df.DataFrame(agg_block) aggregate = agg @@ -261,7 +288,7 @@ def _column_type(self, col_id: str) -> dtypes.Dtype: dtype = self._block.dtypes[col_offset] return dtype - def _aggregate( + def _aggregate_all( self, aggregate_op: agg_ops.AggregateOp, numeric_only: bool = False ) -> df.DataFrame: aggregated_col_ids = self._aggregated_columns(numeric_only=numeric_only) @@ -359,11 +386,11 @@ def prod(self, *args) -> series.Series: def agg(self, func=None) -> typing.Union[df.DataFrame, series.Series]: column_names: list[str] = [] if isinstance(func, str): - aggregations = [(self._value_column, agg_ops.AGGREGATIONS_LOOKUP[func])] + aggregations = [(self._value_column, agg_ops.lookup_agg_func(func))] column_names = [func] elif utils.is_list_like(func): aggregations = [ - (self._value_column, agg_ops.AGGREGATIONS_LOOKUP[f]) for f in func + (self._value_column, agg_ops.lookup_agg_func(f)) for f in func ] column_names = list(func) else: diff --git a/bigframes/core/indexers.py b/bigframes/core/indexers.py index 0aaf169bea..46091f211a 100644 --- a/bigframes/core/indexers.py +++ b/bigframes/core/indexers.py @@ -29,7 +29,7 @@ import bigframes.series if typing.TYPE_CHECKING: - 
LocSingleKey = bigframes.series.Series | indexes.Index | slice + LocSingleKey = typing.Union[bigframes.series.Series, indexes.Index, slice] class LocSeriesIndexer: diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index d4b6e47025..5fbe5d1f9e 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -174,7 +174,11 @@ def __init__( self._query_job: Optional[bigquery.QueryJob] = None def __dir__(self): - return dir(type(self)) + self._block.column_labels + return dir(type(self)) + [ + label + for label in self._block.column_labels + if label and isinstance(label, str) + ] def _ipython_key_completions_(self) -> List[str]: return list( @@ -201,13 +205,16 @@ def _find_indices( col_ids = self._sql_names(columns, tolerance) return [self._block.value_columns.index(col_id) for col_id in col_ids] - def _resolve_label_exact(self, label) -> str: + def _resolve_label_exact(self, label) -> Optional[str]: + """Returns the column id matching the label if there is exactly + one such column. If there are multiple columns with the same name, + raises an error. If there is no such column, returns None.""" matches = self._block.label_to_col_id.get(label, []) - if len(matches) != 1: + if len(matches) > 1: raise ValueError( - f"Index data must be 1-dimensional. {constants.FEEDBACK_LINK}" + f"Multiple columns matching id {label} were found. 
{constants.FEEDBACK_LINK}" ) - return matches[0] + return matches[0] if len(matches) != 0 else None def _sql_names( self, @@ -215,7 +222,11 @@ def _sql_names( tolerance: bool = False, ) -> Sequence[str]: """Retrieve sql name (column name in BQ schema) of column(s).""" - labels = columns if utils.is_list_like(columns) else [columns] # type:ignore + labels = ( + columns + if utils.is_list_like(columns) and not isinstance(columns, tuple) + else [columns] + ) # type:ignore results: Sequence[str] = [] for label in labels: col_ids = self._block.label_to_col_id.get(label, []) @@ -246,6 +257,11 @@ def dtypes(self) -> pandas.Series: def columns(self) -> pandas.Index: return self.dtypes.index + @columns.setter + def columns(self, labels: pandas.Index): + new_block = self._block.with_column_labels(labels) + self._set_block(new_block) + @property def shape(self) -> Tuple[int, int]: return self._block.shape @@ -295,7 +311,7 @@ def _to_sql_query( # Has to be unordered as it is impossible to order the sql without # including metadata columns in selection with ibis. ibis_expr = self._block.expr.to_ibis_expr(ordering_mode="unordered") - column_labels = self._block.column_labels + column_labels = list(self._block.column_labels) # TODO(swast): Need to have a better way of controlling when to include # the index or not. @@ -387,11 +403,8 @@ def __getitem__( if isinstance(key, bigframes.series.Series): return self._getitem_bool_series(key) - sql_names = self._sql_names(key) - # Only input is a single key and only find one column, returns a Series - if (not utils.is_list_like(key)) and len(sql_names) == 1: - return bigframes.series.Series(self._block.select_column(sql_names[0])) - + if isinstance(key, typing.Hashable): + return self._getitem_label(key) # Select a subset of columns or re-order columns. 
# In Ibis after you apply a projection, any column objects from the # table before the projection can't be combined with column objects @@ -410,11 +423,31 @@ def __getitem__( selected_ids: Tuple[str, ...] = () for label in key: - col_ids = self._block.label_to_col_id.get(label, []) + col_ids = self._block.label_to_col_id[label] selected_ids = (*selected_ids, *col_ids) return DataFrame(self._block.select_columns(selected_ids)) + def _getitem_label(self, key: blocks.Label): + col_ids = self._block.cols_matching_label(key) + if len(col_ids) == 0: + raise KeyError(key) + block = self._block.select_columns(col_ids) + if isinstance(self.columns, pandas.MultiIndex): + # Multiindex should drop-level if not selecting entire + key_levels = len(key) if isinstance(key, tuple) else 1 + index_levels = self.columns.nlevels + if key_levels < index_levels: + block = block.with_column_labels( + block.column_labels.droplevel(list(range(key_levels))) + ) + # Force return DataFrame in this case, even if only single column + return DataFrame(block) + + if len(col_ids) == 1: + return bigframes.series.Series(block) + return DataFrame(block) + # Bool Series selects rows def _getitem_bool_series(self, key: bigframes.series.Series) -> DataFrame: if not key.dtype == pandas.BooleanDtype(): @@ -736,7 +769,7 @@ def drop( *, axis: typing.Union[int, str] = 0, index: typing.Any = None, - columns: Union[blocks.Label, Iterable[blocks.Label]] = None, + columns: Union[blocks.Label, Sequence[blocks.Label]] = None, level: typing.Optional[LevelType] = None, ) -> DataFrame: if labels: @@ -767,10 +800,6 @@ def drop( self._block.value_columns ) if columns: - if not utils.is_list_like(columns): - columns = [columns] # type:ignore - columns = list(columns) - block = block.drop_columns(self._sql_names(columns)) if not index and not columns: raise ValueError("Must specify 'labels' or 'index'/'columns") @@ -849,7 +878,7 @@ def _assign_single_item( def _assign_scalar(self, label: str, value: Union[int, float]) -> 
DataFrame: # TODO(swast): Make sure that k is the ID / SQL name, not a label, # which could be invalid SQL. - col_ids = self._sql_names(label, tolerance=True) + col_ids = self._block.cols_matching_label(label) block, constant_col_id = self._block.create_constant(value, label) for col_id in col_ids: @@ -868,7 +897,7 @@ def _assign_series_join_on_index( ) column_ids = [ - get_column_left(col_id) for col_id in self._sql_names(label, tolerance=True) + get_column_left(col_id) for col_id in self._block.cols_matching_label(label) ] block = joined_index._block source_column = get_column_right(series._value_column) @@ -903,7 +932,12 @@ def set_index( else: keys = typing.cast(typing.Sequence[blocks.Label], tuple(keys)) col_ids = [self._resolve_label_exact(key) for key in keys] - return DataFrame(self._block.set_index(col_ids, append=append, drop=drop)) + missing = [keys[i] for i in range(len(col_ids)) if col_ids[i] is None] + if len(missing) > 0: + raise KeyError(f"None of {missing} are in the columns") + # convert col_ids to non-optional strs since we just determined they are not None + col_ids_strs: List[str] = [col_id for col_id in col_ids if col_id is not None] + return DataFrame(self._block.set_index(col_ids_strs, append=append, drop=drop)) def sort_index( self, ascending: bool = True, na_position: Literal["first", "last"] = "last" @@ -932,7 +966,7 @@ def sort_values( if na_position not in {"first", "last"}: raise ValueError("Param na_position must be one of 'first' or 'last'") - sort_labels = tuple(by) if utils.is_list_like(by) else (by,) + sort_labels = list(by) if utils.is_list_like(by) else [by] sort_column_ids = self._sql_names(sort_labels) len_by = len(sort_labels) @@ -982,9 +1016,11 @@ def value_counts( return bigframes.series.Series(block) def add_prefix(self, prefix: str, axis: int | str | None = None) -> DataFrame: + axis = 1 if axis is None else axis return DataFrame(self._get_block().add_prefix(prefix, axis)) def add_suffix(self, suffix: str, axis: int | str 
| None = None) -> DataFrame: + axis = 1 if axis is None else axis return DataFrame(self._get_block().add_suffix(suffix, axis)) def dropna(self) -> DataFrame: @@ -1115,7 +1151,7 @@ def agg( raise NotImplementedError( f"Multiple aggregations only supported on numeric columns. {constants.FEEDBACK_LINK}" ) - aggregations = [agg_ops.AGGREGATIONS_LOOKUP[f] for f in func] + aggregations = [agg_ops.lookup_agg_func(f) for f in func] return DataFrame( self._block.summarize( self._block.value_columns, @@ -1125,7 +1161,7 @@ def agg( else: return bigframes.series.Series( self._block.aggregate_all_and_pivot( - agg_ops.AGGREGATIONS_LOOKUP[typing.cast(str, func)] + agg_ops.lookup_agg_func(typing.cast(str, func)) ) ) @@ -1142,6 +1178,37 @@ def describe(self) -> DataFrame: ) return typing.cast(DataFrame, result) + def pivot( + self, + *, + columns: typing.Union[blocks.Label, Sequence[blocks.Label]], + index: typing.Optional[ + typing.Union[blocks.Label, Sequence[blocks.Label]] + ] = None, + values: typing.Optional[ + typing.Union[blocks.Label, Sequence[blocks.Label]] + ] = None, + ) -> DataFrame: + if index: + block = self.set_index(index)._block + else: + block = self._block + + column_ids = self._sql_names(columns) + if values: + value_col_ids = self._sql_names(values) + else: + value_col_ids = [ + col for col in block.value_columns if col not in column_ids + ] + + pivot_block = block.pivot( + columns=column_ids, + values=value_col_ids, + values_in_index=utils.is_list_like(values), + ) + return DataFrame(pivot_block) + def _drop_non_numeric(self, keep_bool=True) -> DataFrame: types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) if not keep_bool: @@ -1263,7 +1330,10 @@ def merge( ) # Constructs default index - block = blocks.Block(expr, column_labels=labels) + expr, offset_index_id = expr.promote_offsets() + block = blocks.Block( + expr, index_columns=[offset_index_id], column_labels=labels + ) return DataFrame(block) def _get_merged_col_labels( diff --git 
a/bigframes/ml/base.py b/bigframes/ml/base.py index e4c68eb17c..9f9d9f85d0 100644 --- a/bigframes/ml/base.py +++ b/bigframes/ml/base.py @@ -22,9 +22,10 @@ """ import abc -from typing import cast, Optional, TypeVar +from typing import cast, Optional, TypeVar, Union from bigframes.ml import core +import bigframes.pandas as bpd import third_party.bigframes_vendored.sklearn.base @@ -143,3 +144,24 @@ def score(self, X, y): @abc.abstractmethod def to_gbq(self, model_name, replace): pass + + +class Transformer(BaseEstimator): + """A BigQuery DataFrames Transformer base class that transforms data. + + Also the transformers can be attached to a pipeline with a predictor.""" + + @abc.abstractmethod + def fit(self, X, y): + pass + + @abc.abstractmethod + def transform(self, X): + pass + + def fit_transform( + self, + X: Union[bpd.DataFrame, bpd.Series], + y: Optional[Union[bpd.DataFrame, bpd.Series]] = None, + ) -> bpd.DataFrame: + return self.fit(X, y).transform(X) diff --git a/bigframes/ml/compose.py b/bigframes/ml/compose.py index 49b4899beb..df01303ffa 100644 --- a/bigframes/ml/compose.py +++ b/bigframes/ml/compose.py @@ -34,7 +34,7 @@ class ColumnTransformer( third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer, - base.BaseEstimator, + base.Transformer, ): __doc__ = ( third_party.bigframes_vendored.sklearn.compose._column_transformer.ColumnTransformer.__doc__ diff --git a/bigframes/ml/preprocessing.py b/bigframes/ml/preprocessing.py index 500a9fcb24..ee46a37052 100644 --- a/bigframes/ml/preprocessing.py +++ b/bigframes/ml/preprocessing.py @@ -30,7 +30,7 @@ class StandardScaler( third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler, - base.BaseEstimator, + base.Transformer, ): __doc__ = ( third_party.bigframes_vendored.sklearn.preprocessing._data.StandardScaler.__doc__ @@ -106,7 +106,7 @@ def transform(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: class OneHotEncoder( 
third_party.bigframes_vendored.sklearn.preprocessing._encoder.OneHotEncoder, - base.BaseEstimator, + base.Transformer, ): # BQML max value https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-one-hot-encoder#syntax TOP_K_DEFAULT = 1000000 diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 7bdd97812e..58f19ea8e7 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -76,6 +76,12 @@ def _as_ibis(self, x: ibis_types.Value): return x.notnull() +class HashOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.IntegerValue, x).hash() + + +## String Operation class ReverseOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.StringValue, x).reverse() @@ -100,7 +106,58 @@ class IsNumericOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): # catches all members of the Unicode number class, which matches pandas isnumeric # see https://cloud.google.com/bigquery/docs/reference/standard-sql/string_functions#regexp_contains - return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN*)$") + # TODO: Validate correctness, my miss eg ⅕ character + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\pN+)$") + + +class IsAlphaOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +class IsDigitOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + # Based on docs, should include superscript/subscript-ed numbers + # Tests however pass only when set to Nd unicode class + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +class IsDecimalOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.StringValue, x).re_search(r"^(\p{Nd})+$") + + +class IsAlnumOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return 
typing.cast(ibis_types.StringValue, x).re_search( + r"^(\p{N}|\p{Lm}|\p{Lt}|\p{Lu}|\p{Ll}|\p{Lo})+$" + ) + + +class IsSpaceOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + # All characters are whitespace characters, False for empty string + return typing.cast(ibis_types.StringValue, x).re_search(r"^\s+$") + + +class IsLowerOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + # No upper case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search( + r"\p{Ll}" + ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Lu}|\p{Lt}") + + +class IsUpperOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + # No lower case characters, min one cased character + # See: https://docs.python.org/3/library/stdtypes.html#str + return typing.cast(ibis_types.StringValue, x).re_search( + r"\p{Lu}" + ) & ~typing.cast(ibis_types.StringValue, x).re_search(r"\p{Ll}|\p{Lt}") class RstripOp(UnaryOp): @@ -227,11 +284,25 @@ def _as_ibis(self, x: ibis_types.Value): return any_match if any_match is not None else ibis_types.literal(False) -class HashOp(UnaryOp): +class ZfillOp(UnaryOp): + def __init__(self, width: int): + self._width = width + def _as_ibis(self, x: ibis_types.Value): - return typing.cast(ibis_types.IntegerValue, x).hash() + str_value = typing.cast(ibis_types.StringValue, x) + return ( + ibis.case() + .when( + str_value[0] == "-", + "-" + + StrPadOp(self._width - 1, "0", "left")._as_ibis(str_value.substr(1)), + ) + .else_(StrPadOp(self._width, "0", "left")._as_ibis(str_value)) + .end() + ) +## Datetime Ops class DayOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.TimestampValue, x).day() @@ -390,7 +461,14 @@ def _as_ibis(self, x: ibis_types.Value): lower_op = LowerOp() upper_op = UpperOp() strip_op = StripOp() +isalnum_op = IsAlnumOp() +isalpha_op = IsAlphaOp() +isdecimal_op = IsDecimalOp() +isdigit_op = IsDigitOp() 
isnumeric_op = IsNumericOp() +isspace_op = IsSpaceOp() +islower_op = IsLowerOp() +isupper_op = IsUpperOp() rstrip_op = RstripOp() lstrip_op = LstripOp() hash_op = HashOp() @@ -692,6 +770,18 @@ def clip_op( ) +def partial_arg1(op: TernaryOp, scalar: typing.Any) -> BinaryOp: + return lambda x, y: op(dtypes.literal_to_ibis_scalar(scalar, validate=False), x, y) + + +def partial_arg2(op: TernaryOp, scalar: typing.Any) -> BinaryOp: + return lambda x, y: op(x, dtypes.literal_to_ibis_scalar(scalar, validate=False), y) + + +def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: + return lambda x, y: op(x, y, dtypes.literal_to_ibis_scalar(scalar, validate=False)) + + def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) diff --git a/bigframes/operations/aggregations.py b/bigframes/operations/aggregations.py index 1687f705a1..874c264194 100644 --- a/bigframes/operations/aggregations.py +++ b/bigframes/operations/aggregations.py @@ -251,6 +251,21 @@ def skips_nulls(self): return False +class AnyValueOp(AggregateOp): + # Warning: only use if all values are equal. Non-deterministic otherwise. + # Do not expose to users. For special cases only (e.g. pivot). 
+ name = "any_value" + + def _as_ibis( + self, column: ibis_types.Column, window=None + ) -> ibis_types.IntegerValue: + return _apply_window_if_present(column.arbitrary(), window) + + @property + def skips_nulls(self): + return True + + class RankOp(WindowOp): name = "rank" @@ -381,7 +396,7 @@ def _map_to_literal( # TODO: Alternative names and lookup from numpy function objects -AGGREGATIONS_LOOKUP: dict[str, AggregateOp] = { +_AGGREGATIONS_LOOKUP: dict[str, AggregateOp] = { op.name: op for op in [ sum_op, @@ -401,3 +416,18 @@ def _map_to_literal( ApproxQuartilesOp(3), ] } + + +def lookup_agg_func(key: str) -> AggregateOp: + if callable(key): + raise NotImplementedError( + "Aggregating with callable object not supported, pass method name as string instead (eg. 'sum' instead of np.sum)." + ) + if not isinstance(key, str): + raise ValueError( + f"Cannot aggregate using object of type: {type(key)}. Use string method name (eg. 'sum')" + ) + if key in _AGGREGATIONS_LOOKUP: + return _AGGREGATIONS_LOOKUP[key] + else: + raise ValueError(f"Unrecognize aggregate function: {key}") diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py index 0f1395c78f..0545ea34d6 100644 --- a/bigframes/operations/strings.py +++ b/bigframes/operations/strings.py @@ -70,6 +70,41 @@ def upper(self) -> series.Series: def isnumeric(self) -> series.Series: return self._apply_unary_op(ops.isnumeric_op) + def isalpha( + self, + ) -> series.Series: + return self._apply_unary_op(ops.isalpha_op) + + def isdigit( + self, + ) -> series.Series: + return self._apply_unary_op(ops.isdigit_op) + + def isdecimal( + self, + ) -> series.Series: + return self._apply_unary_op(ops.isdecimal_op) + + def isalnum( + self, + ) -> series.Series: + return self._apply_unary_op(ops.isalnum_op) + + def isspace( + self, + ) -> series.Series: + return self._apply_unary_op(ops.isspace_op) + + def islower( + self, + ) -> series.Series: + return self._apply_unary_op(ops.islower_op) + + def isupper( + self, 
+ ) -> series.Series: + return self._apply_unary_op(ops.isupper_op) + def rstrip(self) -> series.Series: return self._apply_unary_op(ops.rstrip_op) @@ -183,6 +218,12 @@ def endswith( pat = (pat,) return self._apply_unary_op(ops.EndsWithOp(pat)) + def zfill(self, width: int) -> series.Series: + return self._apply_unary_op(ops.ZfillOp(width)) + + def center(self, width: int, fillchar: str = " ") -> series.Series: + return self._apply_unary_op(ops.StrPadOp(width, fillchar, "both")) + def cat( self, others: Union[str, series.Series], diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index ed7a09e7b7..b688c18723 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -18,7 +18,6 @@ from collections import namedtuple import inspect -import threading import typing from typing import ( Any, @@ -32,15 +31,21 @@ Optional, Sequence, Tuple, - TypeVar, Union, ) from google.cloud import bigquery import numpy import pandas +from pandas._typing import ( + CompressionOptions, + FilePath, + ReadPickleBuffer, + StorageOptions, +) import bigframes._config as config +import bigframes.core.global_session as global_session import bigframes.core.indexes import bigframes.core.reshape import bigframes.dataframe @@ -49,14 +54,6 @@ import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile -# Support pandas dtype attribute -NA = pandas.NA -BooleanDtype = pandas.BooleanDtype -Float64Dtype = pandas.Float64Dtype -Int64Dtype = pandas.Int64Dtype -StringDtype = pandas.StringDtype -ArrowDtype = pandas.ArrowDtype - # Include method definition so that the method appears in our docs for # bigframes.pandas general functions. 
@@ -135,52 +132,6 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ -options = config.options -"""Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" - -_global_session: Optional[bigframes.session.Session] = None -_global_session_lock = threading.Lock() - - -def reset_session() -> None: - """Start a fresh session the next time a function requires a session. - - Closes the current session if it was already started. - - Returns: - None - """ - global _global_session - - with _global_session_lock: - if _global_session is not None: - _global_session.close() - _global_session = None - - options.bigquery._session_started = False - - -def get_global_session(): - """Gets the global session. - - Creates the global session if it does not exist. - """ - global _global_session, _global_session_lock - - with _global_session_lock: - if _global_session is None: - _global_session = bigframes.session.connect(options.bigquery) - - return _global_session - - -_T = TypeVar("_T") - - -def _with_default_session(func: Callable[..., _T], *args, **kwargs) -> _T: - return func(get_global_session(), *args, **kwargs) - - def _set_default_session_location_if_possible(query): # Set the location as per the query if this is the first query the user is # running and: @@ -257,7 +208,7 @@ def read_csv( encoding: Optional[str] = None, **kwargs, ) -> bigframes.dataframe.DataFrame: - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_csv, filepath_or_buffer=filepath_or_buffer, sep=sep, @@ -283,7 +234,7 @@ def read_gbq( max_results: Optional[int] = None, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_gbq, query, index_col=index_col, @@ -296,7 +247,7 @@ def read_gbq( def read_gbq_model(model_name: str): - return _with_default_session( + return 
global_session.with_default_session( bigframes.session.Session.read_gbq_model, model_name, ) @@ -313,7 +264,7 @@ def read_gbq_query( max_results: Optional[int] = None, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_gbq_query, query, index_col=index_col, @@ -333,7 +284,7 @@ def read_gbq_table( max_results: Optional[int] = None, ) -> bigframes.dataframe.DataFrame: _set_default_session_location_if_possible(query) - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_gbq_table, query, index_col=index_col, @@ -346,7 +297,7 @@ def read_gbq_table( def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataFrame: - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_pandas, pandas_dataframe, ) @@ -355,8 +306,24 @@ def read_pandas(pandas_dataframe: pandas.DataFrame) -> bigframes.dataframe.DataF read_pandas.__doc__ = inspect.getdoc(bigframes.session.Session.read_pandas) +def read_pickle( + filepath_or_buffer: FilePath | ReadPickleBuffer, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, +): + return global_session.with_default_session( + bigframes.session.Session.read_pickle, + filepath_or_buffer=filepath_or_buffer, + compression=compression, + storage_options=storage_options, + ) + + +read_pickle.__doc__ = inspect.getdoc(bigframes.session.Session.read_pickle) + + def read_parquet(path: str | IO["bytes"]) -> bigframes.dataframe.DataFrame: - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_parquet, path, ) @@ -372,7 +339,7 @@ def remote_function( bigquery_connection: Optional[str] = None, reuse: bool = True, ): - return _with_default_session( + return global_session.with_default_session( 
bigframes.session.Session.remote_function, input_types=input_types, output_type=output_type, @@ -386,7 +353,7 @@ def remote_function( def read_gbq_function(function_name: str): - return _with_default_session( + return global_session.with_default_session( bigframes.session.Session.read_gbq_function, function_name=function_name, ) @@ -395,25 +362,58 @@ def read_gbq_function(function_name: str): read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) -# Other aliases +# pandas dtype attributes +NA = pandas.NA +BooleanDtype = pandas.BooleanDtype +Float64Dtype = pandas.Float64Dtype +Int64Dtype = pandas.Int64Dtype +StringDtype = pandas.StringDtype +ArrowDtype = pandas.ArrowDtype + +# Class aliases +# TODO(swast): Make these real classes so we can refer to these in type +# checking and docstrings. DataFrame = bigframes.dataframe.DataFrame Index = bigframes.core.indexes.Index Series = bigframes.series.Series -# Used by DataFrameGroupby.agg +# Other public pandas attributes NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"]) +options = config.options +"""Global :class:`~bigframes._config.Options` to configure BigQuery DataFrames.""" + +# Session management APIs +get_global_session = global_session.get_global_session +reset_session = global_session.reset_session + + # Use __all__ to let type checkers know what is part of the public API. 
__all___ = [ + # Functions "concat", - "DataFrame", - "options", "read_csv", "read_gbq", "read_gbq_function", "read_gbq_model", "read_pandas", + "read_pickle", "remote_function", + # pandas dtype attributes + "NA", + "BooleanDtype", + "Float64Dtype", + "Int64Dtype", + "StringDtype", + "ArrowDtype", + # Class aliases + "DataFrame", + "Index", "Series", + # Other public pandas attributes "NamedAgg", + "options", + # Session management APIs + "get_global_session", + "reset_session", ] diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 27e2b8f7c2..7cf74d6311 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -35,15 +35,10 @@ import cloudpickle import google.api_core.exceptions from google.cloud import bigquery, bigquery_connection_v1, functions_v2 -from google.cloud.bigquery.routine import Routine -from google.cloud.bigquery.standard_sql import StandardSqlTypeNames from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType -from ibis.expr.datatypes.core import boolean from ibis.expr.datatypes.core import DataType as IbisDataType from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type -from ibis.expr.datatypes.core import float64, int64 -from ibis.expr.datatypes.core import string as ibis_string import ibis.expr.operations as ops import ibis.expr.rules as rlz @@ -63,11 +58,16 @@ # Input and output types supported by BigQuery DataFrames remote functions. # TODO(shobs): Extend the support to all types supported by BQ remote functions # https://cloud.google.com/bigquery/docs/remote-functions#limitations -_supported_io_ibis_types = {boolean, float64, int64, ibis_string} -TYPE_ERROR_MESSAGE_FORMAT = ( - f"Type {{}} not supported, supported types are {_supported_io_ibis_types}.
" - f"{constants.FEEDBACK_LINK}" -) +SUPPORTED_IO_PYTHON_TYPES = {bool, float, int, str} +SUPPORTED_IO_BIGQUERY_TYPEKINDS = { + "BOOLEAN", + "BOOL", + "FLOAT", + "FLOAT64", + "INT64", + "INTEGER", + "STRING", +} def get_remote_function_locations(bq_location): @@ -116,7 +116,7 @@ def routine_ref_to_string_for_query(routine_ref: bigquery.RoutineReference) -> s class IbisSignature(NamedTuple): parameter_names: List[str] - input_types: List[IbisDataType] + input_types: List[Optional[IbisDataType]] output_type: IbisDataType @@ -512,7 +512,7 @@ def remote_function_node( """Creates an Ibis node representing a remote function call.""" fields = { - name: rlz.value(type_) + name: rlz.value(type_) if type_ else rlz.any for name, type_ in zip( ibis_signature.parameter_names, ibis_signature.input_types ) @@ -538,20 +538,22 @@ def f(*args, **kwargs): return f +class UnsupportedTypeError(ValueError): + def __init__(self, type_, supported_types): + self.type = type_ + self.supported_types = supported_types + + def ibis_type_from_python_type(t: type) -> IbisDataType: - ibis_type = python_type_to_bigquery_type(t) - assert ibis_type in _supported_io_ibis_types, TYPE_ERROR_MESSAGE_FORMAT.format( - ibis_type - ) - return ibis_type + if t not in SUPPORTED_IO_PYTHON_TYPES: + raise UnsupportedTypeError(t, SUPPORTED_IO_PYTHON_TYPES) + return python_type_to_bigquery_type(t) -def ibis_type_from_type_kind(tk: StandardSqlTypeNames) -> IbisDataType: - ibis_type = BigQueryType.to_ibis(tk) - assert ibis_type in _supported_io_ibis_types, TYPE_ERROR_MESSAGE_FORMAT.format( - ibis_type - ) - return ibis_type +def ibis_type_from_type_kind(tk: bigquery.StandardSqlTypeNames) -> IbisDataType: + if tk not in SUPPORTED_IO_BIGQUERY_TYPEKINDS: + raise UnsupportedTypeError(tk, SUPPORTED_IO_BIGQUERY_TYPEKINDS) + return BigQueryType.to_ibis(tk) def ibis_signature_from_python_signature( @@ -566,13 +568,18 @@ def ibis_signature_from_python_signature( ) -def ibis_signature_from_routine( - routine: Routine, -) -> 
IbisSignature: +class ReturnTypeMissingError(ValueError): + pass + + +def ibis_signature_from_routine(routine: bigquery.Routine) -> IbisSignature: + if not routine.return_type: + raise ReturnTypeMissingError + return IbisSignature( parameter_names=[arg.name for arg in routine.arguments], input_types=[ - ibis_type_from_type_kind(arg.data_type.type_kind) + ibis_type_from_type_kind(arg.data_type.type_kind) if arg.data_type else None for arg in routine.arguments ], output_type=ibis_type_from_type_kind(routine.return_type.type_kind), @@ -584,9 +591,7 @@ class DatasetMissingError(ValueError): def get_routine_reference( - routine_ref_str: str, - bigquery_client: bigquery.Client, - session: Optional[Session], + routine_ref_str: str, bigquery_client: bigquery.Client, session: Optional[Session] ) -> bigquery.RoutineReference: try: # Handle cases ".." and @@ -859,7 +864,21 @@ def read_gbq_function( ) # Find the routine and get its arguments. - routine = bigquery_client.get_routine(routine_ref) - ibis_signature = ibis_signature_from_routine(routine) + try: + routine = bigquery_client.get_routine(routine_ref) + except google.api_core.exceptions.NotFound: + raise ValueError(f"Unknown function '{routine_ref}'. {constants.FEEDBACK_LINK}") + + try: + ibis_signature = ibis_signature_from_routine(routine) + except ReturnTypeMissingError: + raise ValueError( + f"Function return type must be specified. {constants.FEEDBACK_LINK}" + ) + except UnsupportedTypeError as e: + raise ValueError( + f"Type {e.type} not supported, supported types are {e.supported_types}. " + f"{constants.FEEDBACK_LINK}" + ) return remote_function_node(routine_ref, ibis_signature) diff --git a/bigframes/series.py b/bigframes/series.py index f8f44dc2e6..a1da93dee3 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -651,7 +651,7 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: raise NotImplementedError( f"Multiple aggregations only supported on numeric series.
{constants.FEEDBACK_LINK}" ) - aggregations = [agg_ops.AGGREGATIONS_LOOKUP[f] for f in func] + aggregations = [agg_ops.lookup_agg_func(f) for f in func] return Series( self._block.summarize( [self._value_column], @@ -661,7 +661,7 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series: else: return self._apply_aggregation( - agg_ops.AGGREGATIONS_LOOKUP[typing.cast(str, func)] + agg_ops.lookup_agg_func(typing.cast(str, func)) ) def skew(self): diff --git a/bigframes/session.py b/bigframes/session.py index 73fdd73106..3ef5250746 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -54,6 +54,12 @@ import ibis.expr.types as ibis_types import numpy as np import pandas +from pandas._typing import ( + CompressionOptions, + FilePath, + ReadPickleBuffer, + StorageOptions, +) import pydata_google_auth import bigframes._config.bigquery_options as bigquery_options @@ -75,6 +81,7 @@ import third_party.bigframes_vendored.pandas.io.gbq as third_party_pandas_gbq import third_party.bigframes_vendored.pandas.io.parquet as third_party_pandas_parquet import third_party.bigframes_vendored.pandas.io.parsers.readers as third_party_pandas_readers +import third_party.bigframes_vendored.pandas.io.pickle as third_party_pandas_pickle _ENV_DEFAULT_PROJECT = "GOOGLE_CLOUD_PROJECT" _APPLICATION_NAME = f"bigframes/{bigframes.version.__version__}" @@ -194,6 +201,7 @@ def _create_cloud_clients( class Session( third_party_pandas_gbq.GBQIOMixin, third_party_pandas_parquet.ParquetIOMixin, + third_party_pandas_pickle.PickleIOMixin, third_party_pandas_readers.ReaderIOMixin, ): """Establishes a BigQuery connection to capture a group of job activities related to @@ -252,6 +260,8 @@ def _create_and_bind_bq_session(self): """Create a BQ session and bind the session id with clients to capture BQ activities: go/bigframes-transient-data""" job_config = bigquery.QueryJobConfig(create_session=True) + # Make sure the session is a new one, not one associated with another query. 
+ job_config.use_query_cache = False query_job = self.bqclient.query( "SELECT 1", job_config=job_config, location=self._location ) @@ -458,8 +468,8 @@ def read_gbq_table( {self.ibis_client.compile(distinct_table)} ) - SELECT (SELECT COUNT(*) FROM full_table) AS total_count, - (SELECT COUNT(*) FROM distinct_table) AS distinct_count + SELECT (SELECT COUNT(*) FROM full_table) AS `total_count`, + (SELECT COUNT(*) FROM distinct_table) AS `distinct_count` """ results, query_job = self._start_query(is_unique_sql) row = next(iter(results)) @@ -467,6 +477,7 @@ def read_gbq_table( total_count = row["total_count"] distinct_count = row["distinct_count"] is_total_ordering = total_count == distinct_count + ordering = core.ExpressionOrdering( ordering_value_columns=[ core.OrderingColumnReference(column_id) for column_id in index_cols @@ -477,7 +488,6 @@ def read_gbq_table( # We have a total ordering, so query via "time travel" so that # the underlying data doesn't mutate. if is_total_ordering: - # Get the timestamp from the job metadata rather than the query # text so that the query for determining uniqueness of the ID # columns can be cached. 
@@ -663,7 +673,8 @@ def _read_ibis( core.ArrayValue( self, table_expression, columns, hidden_ordering_columns, ordering ), - [index_col.get_name() for index_col in index_cols], + index_columns=[index_col.get_name() for index_col in index_cols], + column_labels=column_keys, index_labels=index_labels, ) @@ -887,6 +898,25 @@ def read_csv( ) return self.read_pandas(pandas_df) + def read_pickle( + self, + filepath_or_buffer: FilePath | ReadPickleBuffer, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, + ): + pandas_obj = pandas.read_pickle( + filepath_or_buffer, + compression=compression, + storage_options=storage_options, + ) + + if isinstance(pandas_obj, pandas.Series): + if pandas_obj.name is None: + pandas_obj.name = "0" + bigframes_df = self.read_pandas(pandas_obj.to_frame()) + return bigframes_df[bigframes_df.columns[0]] + return self.read_pandas(pandas_obj) + def read_parquet( self, path: str | IO["bytes"], @@ -1086,6 +1116,10 @@ def read_gbq_function( Then it can be applied to a DataFrame or Series. + .. note:: + The return type of the function must be explicitly specified in the + function's original definition even if not otherwise required. + Args: function_name (str): the function's name in BigQuery in the format diff --git a/docs/index.rst b/docs/index.rst index ff1cd09eb7..d239ea3a78 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,7 +6,6 @@ API reference .. 
toctree:: :maxdepth: 3 - user_guide/index reference/index Changelog diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml index 91abc59bc0..891f15a51b 100644 --- a/docs/templates/toc.yml +++ b/docs/templates/toc.yml @@ -128,3 +128,4 @@ name: preprocessing name: bigframes.ml name: BigQuery DataFrames + status: beta diff --git a/docs/user_guide/bigframes.pandas/remote_functions.rst b/docs/user_guide/bigframes.pandas/remote_functions.rst deleted file mode 100644 index 7540ba8a28..0000000000 --- a/docs/user_guide/bigframes.pandas/remote_functions.rst +++ /dev/null @@ -1,134 +0,0 @@ - -Using the Remote Functions -========================== - -BigQuery DataFrames gives you the ability to turn your custom scalar functions -into a BigQuery remote function. It requires the GCP project to be set up -appropriately and the user having sufficient privileges to use them. One can -find more details on it via `help` command. - -.. code-block:: python - - import bigframes.pandas as bpd - help(bpd.remote_function) - -Read a table and inspect the column of interest. - -.. code-block:: python - - df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") - df["body_mass_g"].head(10) - -Define a custom function, and specify the intent to turn it into a remote -function. It requires a BigQuery connection. If the connection is not already -created, BigQuery DataFrames will attempt to create one assuming the necessary -APIs and IAM permissions are setup in the project. In our examples we would be -using a pre-created connection named `bigframes-rf-conn`. Let's try a -`pandas`-like use case in which we want to apply a user defined scalar function -to every value in a `Series`, more specifically bucketize the `body_mass_g` value -of the penguins, which is a real number, into a category, which is a string. - -.. 
code-block:: python - - @bpd.remote_function([float], str, bigquery_connection='bigframes-rf-conn') - def get_bucket(num): - if not num: return "NA" - boundary = 4000 - return "at_or_above_4000" if num >= boundary else "below_4000" - -Then we can apply the remote function on the `Series`` of interest via `apply` -API and store the result in a new column in the DataFrame. - -.. code-block:: python - - df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket)) - -This will add a new column `body_mass_bucket` in the DataFrame. You can preview -the original value and the bucketized value side by side. - -.. code-block:: python - - df[['body_mass_g', 'body_mass_bucket']].head(10) - -This operation was possible by doing all the computation on the cloud. For that, -there is a google cloud function deployed by serializing the user code. - -.. warning:: - The deployed cloud function may be visible to other users with sufficient - privilege in the project. The user should be careful about having any - sensitive data in the code that will be deployed as a remote function. - -The cloud function can be located from a property set in the remote function object. - -.. code-block:: python - - get_bucket.bigframes_cloud_function - -and then there is a BigQuery remote function created configured to call into the -cloud function via the BigQuery connection. That can also be located from -another property set in the remote function object. - -.. code-block:: python - - get_bucket.bigframes_remote_function - -The cloud assets created are persistant and the user can manage them directy -from the Google Cloud Console. - -Let's continue trying other potential use cases of remote functions. Let's say -we consider the `species`, `island` and `sex` of the penguins sensitive -information and want to redact that by replacing with their hash code instead. -Let's define another scalar custom function and decorated it as a remote function: - -.. 
code-block:: python - - @bpd.remote_function([str], str, bigquery_connection='bigframes-rf-conn') - def get_hash(input): - import hashlib - # handle missing value - if input is None: - input = "" - encoded_input = input.encode() - hash = hashlib.md5(encoded_input) - return hash.hexdigest() - -We can use this remote function in another `pandas`-like API `map` that can be -applied on a DataFrame: - -.. code-block:: python - - df_redacted = df[["species", "island", "sex"]].map(get_hash) - df_redacted.head(10). - -Using Existing Functions -======================== - -If you have already defined a custom function in BigQuery, either in the -BigQuery Google Cloud Console or with the `remote_function` decorator above or -otherwise, you may use it with BigQuery DataFrames with the `read_gbq_function` -method. - -More details are available via the `help` command: - -.. code-block:: python - - import bigframes.pandas as pd - help(pd.read_gbq_function) - -Here is an example of using `read_gbq_function` to load an existing function -named `get_bucket`: - -.. code-block:: python - - import bigframes.pandas as pd - - df = pd.read_gbq("bigquery-public-data.ml_datasets.penguins") - get_bucket = pd.read_gbq_function("get_bucket") - - df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket)) - df.head(10) - -Note: As mentioned above, if a function is created using the `remote_function` -decorator, its generated name (including project and dataset) is accessible -immediately afterward in the function's `bigframes_remote_function` attribute. -The same string can be passed to `read_gbq_function` later in another context. diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst deleted file mode 100644 index 915e172159..0000000000 --- a/docs/user_guide/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. _user_guide: - -User Guide -========== - -.. 
toctree:: - :maxdepth: 2 - - bigframes.pandas/remote_functions diff --git a/notebooks/00 - Summary.ipynb b/notebooks/00 - Summary.ipynb deleted file mode 100644 index 66ac9a8de8..0000000000 --- a/notebooks/00 - Summary.ipynb +++ /dev/null @@ -1,2060 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using the BigQuery DataFrames API" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Set BigQuery DataFrames options" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import bigframes.pandas\n", - "\n", - "bigframes.pandas.options.bigquery.project = \"bigframes-dev\"\n", - "bigframes.pandas.options.bigquery.location = \"us\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialize a dataframe for a BigQuery table" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "df = bigframes.pandas.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## View the DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Adelie Penguin (Pygoscelis adeliae)Dream36.618.4184.03475.0FEMALE
1Adelie Penguin (Pygoscelis adeliae)Dream39.819.1184.04650.0MALE
2Adelie Penguin (Pygoscelis adeliae)Dream40.918.9184.03900.0MALE
3Chinstrap penguin (Pygoscelis antarctica)Dream46.517.9192.03500.0FEMALE
4Adelie Penguin (Pygoscelis adeliae)Dream37.316.8192.03000.0FEMALE
5Adelie Penguin (Pygoscelis adeliae)Dream43.218.5192.04100.0MALE
6Chinstrap penguin (Pygoscelis antarctica)Dream46.916.6192.02700.0FEMALE
7Chinstrap penguin (Pygoscelis antarctica)Dream50.518.4200.03400.0FEMALE
8Chinstrap penguin (Pygoscelis antarctica)Dream49.519.0200.03800.0MALE
9Adelie Penguin (Pygoscelis adeliae)Dream40.220.1200.03975.0MALE
10Adelie Penguin (Pygoscelis adeliae)Dream40.818.9208.04300.0MALE
11Adelie Penguin (Pygoscelis adeliae)Dream39.018.7185.03650.0MALE
12Adelie Penguin (Pygoscelis adeliae)Dream37.016.9185.03000.0FEMALE
13Chinstrap penguin (Pygoscelis antarctica)Dream47.017.3185.03700.0FEMALE
14Adelie Penguin (Pygoscelis adeliae)Dream34.017.1185.03400.0FEMALE
15Adelie Penguin (Pygoscelis adeliae)Dream37.016.5185.03400.0FEMALE
16Chinstrap penguin (Pygoscelis antarctica)Dream45.717.3193.03600.0FEMALE
17Chinstrap penguin (Pygoscelis antarctica)Dream50.619.4193.03800.0MALE
18Adelie Penguin (Pygoscelis adeliae)Dream39.717.9193.04250.0MALE
19Adelie Penguin (Pygoscelis adeliae)Dream37.818.1193.03750.0MALE
\n", - "
[344 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.4 184.0 3475.0 FEMALE \n", - "1 19.1 184.0 4650.0 MALE \n", - "2 18.9 184.0 3900.0 MALE \n", - "3 17.9 192.0 3500.0 FEMALE \n", - "4 16.8 192.0 3000.0 FEMALE \n", - "5 18.5 192.0 4100.0 MALE \n", - "6 16.6 192.0 2700.0 FEMALE \n", - "7 18.4 200.0 3400.0 FEMALE \n", - "8 19.0 200.0 3800.0 MALE \n", - "9 20.1 200.0 3975.0 
MALE \n", - "10 18.9 208.0 4300.0 MALE \n", - "11 18.7 185.0 3650.0 MALE \n", - "12 16.9 185.0 3000.0 FEMALE \n", - "13 17.3 185.0 3700.0 FEMALE \n", - "14 17.1 185.0 3400.0 FEMALE \n", - "15 16.5 185.0 3400.0 FEMALE \n", - "16 17.3 193.0 3600.0 FEMALE \n", - "17 19.4 193.0 3800.0 MALE \n", - "18 17.9 193.0 4250.0 MALE \n", - "19 18.1 193.0 3750.0 MALE \n", - "20 17.8 193.0 3800.0 FEMALE \n", - "21 19.2 193.0 3650.0 MALE \n", - "22 17.1 193.0 3400.0 FEMALE \n", - "23 18.5 193.0 3500.0 FEMALE \n", - "24 18.2 193.0 3775.0 MALE \n", - "...\n", - "\n", - "[344 rows x 7 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the column names in the dataframe (aka columns names in the table)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['species', 'island', 'culmen_length_mm', 'culmen_depth_mm',\n", - " 'flipper_length_mm', 'body_mass_g', 'sex'],\n", - " dtype='object')" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.columns" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the table schema" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "species string[pyarrow]\n", - "island string[pyarrow]\n", - "culmen_length_mm Float64\n", - "culmen_depth_mm Float64\n", - "flipper_length_mm Float64\n", - "body_mass_g Float64\n", - "sex string[pyarrow]\n", - "dtype: object" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.dtypes" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select a subset of columns" - ] 
- }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandbody_mass_g
0Adelie Penguin (Pygoscelis adeliae)Dream3475.0
1Adelie Penguin (Pygoscelis adeliae)Dream4650.0
2Adelie Penguin (Pygoscelis adeliae)Dream3900.0
3Chinstrap penguin (Pygoscelis antarctica)Dream3500.0
4Adelie Penguin (Pygoscelis adeliae)Dream3000.0
5Adelie Penguin (Pygoscelis adeliae)Dream4100.0
6Chinstrap penguin (Pygoscelis antarctica)Dream2700.0
7Chinstrap penguin (Pygoscelis antarctica)Dream3400.0
8Chinstrap penguin (Pygoscelis antarctica)Dream3800.0
9Adelie Penguin (Pygoscelis adeliae)Dream3975.0
10Adelie Penguin (Pygoscelis adeliae)Dream4300.0
11Adelie Penguin (Pygoscelis adeliae)Dream3650.0
12Adelie Penguin (Pygoscelis adeliae)Dream3000.0
13Chinstrap penguin (Pygoscelis antarctica)Dream3700.0
14Adelie Penguin (Pygoscelis adeliae)Dream3400.0
15Adelie Penguin (Pygoscelis adeliae)Dream3400.0
16Chinstrap penguin (Pygoscelis antarctica)Dream3600.0
17Chinstrap penguin (Pygoscelis antarctica)Dream3800.0
18Adelie Penguin (Pygoscelis adeliae)Dream4250.0
19Adelie Penguin (Pygoscelis adeliae)Dream3750.0
\n", - "
[344 rows x 3 columns in total]" - ], - "text/plain": [ - " species island body_mass_g\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 3475.0\n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0\n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 3900.0\n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 3500.0\n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0\n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0\n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 2700.0\n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 3400.0\n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 3975.0\n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 3650.0\n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 3000.0\n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 3700.0\n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 3600.0\n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0\n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 3750.0\n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 3800.0\n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 3650.0\n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 3400.0\n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 3500.0\n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 3775.0\n", - "...\n", - "\n", - "[344 rows x 3 columns]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df[[\n", - " \"species\",\n", - " \"island\",\n", - " \"body_mass_g\",\n", - "]]\n", - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### View the first ten values of 
a series" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 3475.0\n", - "1 4650.0\n", - "2 3900.0\n", - "3 3500.0\n", - "4 3000.0\n", - "5 4100.0\n", - "6 2700.0\n", - "7 3400.0\n", - "8 3800.0\n", - "9 3975.0\n", - "Name: body_mass_g, dtype: Float64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['body_mass_g'].head(10)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Compute the mean of a series" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4201.7543859649095" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['body_mass_g'].mean()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Filter the DataFrame" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandbody_mass_g
1Adelie Penguin (Pygoscelis adeliae)Dream4650.0
5Adelie Penguin (Pygoscelis adeliae)Dream4100.0
10Adelie Penguin (Pygoscelis adeliae)Dream4300.0
18Adelie Penguin (Pygoscelis adeliae)Dream4250.0
25Chinstrap penguin (Pygoscelis antarctica)Dream4050.0
26Adelie Penguin (Pygoscelis adeliae)Dream4000.0
27Chinstrap penguin (Pygoscelis antarctica)Dream4050.0
28Chinstrap penguin (Pygoscelis antarctica)Dream4300.0
30Chinstrap penguin (Pygoscelis antarctica)Dream4450.0
36Adelie Penguin (Pygoscelis adeliae)Dream4450.0
44Chinstrap penguin (Pygoscelis antarctica)Dream4100.0
45Chinstrap penguin (Pygoscelis antarctica)Dream4800.0
57Chinstrap penguin (Pygoscelis antarctica)Dream4400.0
61Chinstrap penguin (Pygoscelis antarctica)Dream4150.0
66Chinstrap penguin (Pygoscelis antarctica)Dream4050.0
67Chinstrap penguin (Pygoscelis antarctica)Dream4050.0
68Chinstrap penguin (Pygoscelis antarctica)Dream4100.0
74Adelie Penguin (Pygoscelis adeliae)Dream4350.0
77Adelie Penguin (Pygoscelis adeliae)Dream4150.0
78Adelie Penguin (Pygoscelis adeliae)Dream4400.0
\n", - "
[177 rows x 3 columns in total]" - ], - "text/plain": [ - " species island body_mass_g\n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 4650.0\n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 4100.0\n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 4250.0\n", - "25 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", - "26 Adelie Penguin (Pygoscelis adeliae) Dream 4000.0\n", - "27 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", - "28 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0\n", - "30 Chinstrap penguin (Pygoscelis antarctica) Dream 4450.0\n", - "36 Adelie Penguin (Pygoscelis adeliae) Dream 4450.0\n", - "44 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0\n", - "45 Chinstrap penguin (Pygoscelis antarctica) Dream 4800.0\n", - "57 Chinstrap penguin (Pygoscelis antarctica) Dream 4400.0\n", - "61 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0\n", - "66 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", - "67 Chinstrap penguin (Pygoscelis antarctica) Dream 4050.0\n", - "68 Chinstrap penguin (Pygoscelis antarctica) Dream 4100.0\n", - "74 Adelie Penguin (Pygoscelis adeliae) Dream 4350.0\n", - "77 Adelie Penguin (Pygoscelis adeliae) Dream 4150.0\n", - "78 Adelie Penguin (Pygoscelis adeliae) Dream 4400.0\n", - "80 Chinstrap penguin (Pygoscelis antarctica) Dream 4300.0\n", - "90 Chinstrap penguin (Pygoscelis antarctica) Dream 4150.0\n", - "92 Chinstrap penguin (Pygoscelis antarctica) Dream 4500.0\n", - "93 Adelie Penguin (Pygoscelis adeliae) Dream 4300.0\n", - "94 Chinstrap penguin (Pygoscelis antarctica) Dream 4550.0\n", - "...\n", - "\n", - "[177 rows x 3 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['body_mass_g'] >= 4000.0]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using the Remote Functions" - ] - }, - { - "attachments": 
{}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### BigQuery DataFrames gives you the ability to turn your custom scalar functions into a BigQuery remote function.\n", - "\n", - "It requires the GCP project to be set up appropriately and the user having sufficient privileges to use them. One can find more details on it via `help` command." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Help on function remote_function in module bigframes.pandas:\n", - "\n", - "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", - " Decorator to turn a user defined function into a BigQuery remote function.\n", - " \n", - " Args:\n", - " input_types (list(type)):\n", - " List of input data types in the user defined function.\n", - " output_type (type):\n", - " Data type of the output in the user defined function.\n", - " dataset (str, Optional):\n", - " Dataset to use to create a BigQuery function. It should be in\n", - " `.` or `` format. If this\n", - " param is not provided then session dataset id would be used.\n", - " bigquery_connection (str, Optional):\n", - " Name of the BigQuery connection. If it is pre created in the same\n", - " location as the `bigquery_client.location` then it would be used,\n", - " otherwise it would be created dynamically assuming the user has\n", - " necessary priviliges. 
If this param is not provided then the\n", - " bigquery connection from the session would be used.\n", - " reuse (bool, Optional):\n", - " Reuse the remote function if already exists.\n", - " `True` by default, which will result in reusing an existing remote\n", - " function (if any) that was previously created for the same udf.\n", - " Setting it to false would force creating a unique remote function.\n", - " If the required remote function does not exist then it would be\n", - " created irrespective of this param.\n", - " \n", - " Notes:\n", - " Please make sure following is setup before using this API:\n", - " \n", - " 1. Have the below APIs enabled for your project:\n", - " a. BigQuery Connection API\n", - " b. Cloud Functions API\n", - " c. Cloud Run API\n", - " d. Cloud Build API\n", - " e. Artifact Registry API\n", - " f. Cloud Resource Manager API\n", - " \n", - " This can be done from the cloud console (change PROJECT_ID to yours):\n", - " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", - " Or from the gcloud CLI:\n", - " $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com\n", - " \n", - " 2. Have following IAM roles enabled for you:\n", - " a. BigQuery Data Editor (roles/bigquery.dataEditor)\n", - " b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", - " c. Cloud Functions Developer (roles/cloudfunctions.developer)\n", - " d. Service Account User (roles/iam.serviceAccountUser)\n", - " e. Storage Object Viewer (roles/storage.objectViewer)\n", - " f. 
Project IAM Admin (roles/resourcemanager.projectIamAdmin)\n", - " (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", - " \n", - " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", - " a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", - " b. To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", - " Alternatively, the IAM could also be setup via the gcloud CLI:\n", - " $ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"\n", - "\n" - ] - } - ], - "source": [ - "import bigframes.pandas as pd\n", - "help(pd.remote_function)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Define a custom function, and specify the intent to turn it into a remote function.\n", - "\n", - "It requires a BigQuery connection. If the connection is not already created,\n", - "the BigQuery DataFrames package attempts to create one assuming the necessary\n", - "APIs and IAM permissions are setup in the project." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO][2023-06-28 23:31:49,355][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b3fab64f5997ad6a516379defe8d4202 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmp9w5e89lh --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", - "Preparing function...\n", - ".done.\n", - "Deploying function...\n", - "[Build]..........................................................................................................................................................................................................................................................................................................................................................................................................................done\n", - "[Service].........................................................................................................................................................................................................done\n", - "Done.\n", - "You can view your function in the Cloud Console here: https://console.cloud.google.com/functions/details/us-central1/bigframes-b3fab64f5997ad6a516379defe8d4202?project=bigframes-dev\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "buildConfig:\n", - " build: projects/1084210331973/locations/us-central1/builds/780b1780-9b38-4515-ae60-89d05454ef83\n", - " entryPoint: udf_http\n", - " runtime: python310\n", - " source:\n", - " storageSource:\n", - " bucket: gcf-v2-sources-1084210331973-us-central1\n", - " object: bigframes-b3fab64f5997ad6a516379defe8d4202/function-source.zip\n", - " sourceProvenance:\n", - " resolvedStorageSource:\n", - " bucket: gcf-v2-sources-1084210331973-us-central1\n", - " generation: '1687995112300727'\n", - " 
object: bigframes-b3fab64f5997ad6a516379defe8d4202/function-source.zip\n", - "environment: GEN_2\n", - "labels:\n", - " deployment-tool: cli-gcloud\n", - "name: projects/bigframes-dev/locations/us-central1/functions/bigframes-b3fab64f5997ad6a516379defe8d4202\n", - "serviceConfig:\n", - " allTrafficOnLatestRevision: true\n", - " availableCpu: '0.1666'\n", - " availableMemory: 256M\n", - " ingressSettings: ALLOW_ALL\n", - " maxInstanceCount: 100\n", - " maxInstanceRequestConcurrency: 1\n", - " revision: bigframes-b3fab64f5997ad6a516379defe8d4202-00001-tut\n", - " service: projects/bigframes-dev/locations/us-central1/services/bigframes-b3fab64f5997ad6a516379defe8d4202\n", - " serviceAccountEmail: 1084210331973-compute@developer.gserviceaccount.com\n", - " timeoutSeconds: 60\n", - " uri: https://bigframes-b3fab64f5997ad6a516379defe8d4202-7krlje3eoq-uc.a.run.app\n", - "state: ACTIVE\n", - "updateTime: '2023-06-28T23:32:51.911131997Z'\n", - "url: https://us-central1-bigframes-dev.cloudfunctions.net/bigframes-b3fab64f5997ad6a516379defe8d4202\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO][2023-06-28 23:32:55,330][bigframes.remote_function] Successfully created cloud function bigframes-b3fab64f5997ad6a516379defe8d4202 with uri (https://bigframes-b3fab64f5997ad6a516379defe8d4202-7krlje3eoq-uc.a.run.app)\n", - "[INFO][2023-06-28 23:32:59,378][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", - "[INFO][2023-06-28 23:32:59,379][bigframes.remote_function] Creating BQ remote function: \n", - " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b3fab64f5997ad6a516379defe8d4202(num FLOAT64)\n", - " RETURNS STRING\n", - " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", - " OPTIONS (\n", - " endpoint = \"https://bigframes-b3fab64f5997ad6a516379defe8d4202-7krlje3eoq-uc.a.run.app/"\n", - " )\n", - "[INFO][2023-06-28 23:33:00,338][bigframes.remote_function] Created remote function 
bigframes-dev.bigframes_temp_us.bigframes_b3fab64f5997ad6a516379defe8d4202\n" - ] - } - ], - "source": [ - "@pd.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", - "def get_bucket(num):\n", - " if not num: return \"NA\"\n", - " boundary = 4000\n", - " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Run the custom function on the BigQuery-backed dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
body_mass_gbody_mass_bucket
03475.0below_4000
14650.0at_or_above_4000
23900.0below_4000
33500.0below_4000
43000.0below_4000
54100.0at_or_above_4000
62700.0below_4000
73400.0below_4000
83800.0below_4000
93975.0below_4000
\n", - "
[10 rows x 2 columns in total]" - ], - "text/plain": [ - " body_mass_g body_mass_bucket\n", - "0 3475.0 below_4000\n", - "1 4650.0 at_or_above_4000\n", - "2 3900.0 below_4000\n", - "3 3500.0 below_4000\n", - "4 3000.0 below_4000\n", - "5 4100.0 at_or_above_4000\n", - "6 2700.0 below_4000\n", - "7 3400.0 below_4000\n", - "8 3800.0 below_4000\n", - "9 3975.0 below_4000\n", - "\n", - "[10 rows x 2 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.assign(body_mass_bucket=df['body_mass_g'].apply(get_bucket))\n", - "df[['body_mass_g', 'body_mass_bucket']].head(10)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Using the ML API" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Initialize a DataFrame from a BigQuery table" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
speciesislandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Adelie Penguin (Pygoscelis adeliae)Dream36.618.4184.03475.0FEMALE
1Adelie Penguin (Pygoscelis adeliae)Dream39.819.1184.04650.0MALE
2Adelie Penguin (Pygoscelis adeliae)Dream40.918.9184.03900.0MALE
3Chinstrap penguin (Pygoscelis antarctica)Dream46.517.9192.03500.0FEMALE
4Adelie Penguin (Pygoscelis adeliae)Dream37.316.8192.03000.0FEMALE
5Adelie Penguin (Pygoscelis adeliae)Dream43.218.5192.04100.0MALE
6Chinstrap penguin (Pygoscelis antarctica)Dream46.916.6192.02700.0FEMALE
7Chinstrap penguin (Pygoscelis antarctica)Dream50.518.4200.03400.0FEMALE
8Chinstrap penguin (Pygoscelis antarctica)Dream49.519.0200.03800.0MALE
9Adelie Penguin (Pygoscelis adeliae)Dream40.220.1200.03975.0MALE
10Adelie Penguin (Pygoscelis adeliae)Dream40.818.9208.04300.0MALE
11Adelie Penguin (Pygoscelis adeliae)Dream39.018.7185.03650.0MALE
12Adelie Penguin (Pygoscelis adeliae)Dream37.016.9185.03000.0FEMALE
13Chinstrap penguin (Pygoscelis antarctica)Dream47.017.3185.03700.0FEMALE
14Adelie Penguin (Pygoscelis adeliae)Dream34.017.1185.03400.0FEMALE
15Adelie Penguin (Pygoscelis adeliae)Dream37.016.5185.03400.0FEMALE
16Chinstrap penguin (Pygoscelis antarctica)Dream45.717.3193.03600.0FEMALE
17Chinstrap penguin (Pygoscelis antarctica)Dream50.619.4193.03800.0MALE
18Adelie Penguin (Pygoscelis adeliae)Dream39.717.9193.04250.0MALE
19Adelie Penguin (Pygoscelis adeliae)Dream37.818.1193.03750.0MALE
\n", - "
[344 rows x 7 columns in total]" - ], - "text/plain": [ - " species island culmen_length_mm \\\n", - "0 Adelie Penguin (Pygoscelis adeliae) Dream 36.6 \n", - "1 Adelie Penguin (Pygoscelis adeliae) Dream 39.8 \n", - "2 Adelie Penguin (Pygoscelis adeliae) Dream 40.9 \n", - "3 Chinstrap penguin (Pygoscelis antarctica) Dream 46.5 \n", - "4 Adelie Penguin (Pygoscelis adeliae) Dream 37.3 \n", - "5 Adelie Penguin (Pygoscelis adeliae) Dream 43.2 \n", - "6 Chinstrap penguin (Pygoscelis antarctica) Dream 46.9 \n", - "7 Chinstrap penguin (Pygoscelis antarctica) Dream 50.5 \n", - "8 Chinstrap penguin (Pygoscelis antarctica) Dream 49.5 \n", - "9 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "10 Adelie Penguin (Pygoscelis adeliae) Dream 40.8 \n", - "11 Adelie Penguin (Pygoscelis adeliae) Dream 39.0 \n", - "12 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "13 Chinstrap penguin (Pygoscelis antarctica) Dream 47.0 \n", - "14 Adelie Penguin (Pygoscelis adeliae) Dream 34.0 \n", - "15 Adelie Penguin (Pygoscelis adeliae) Dream 37.0 \n", - "16 Chinstrap penguin (Pygoscelis antarctica) Dream 45.7 \n", - "17 Chinstrap penguin (Pygoscelis antarctica) Dream 50.6 \n", - "18 Adelie Penguin (Pygoscelis adeliae) Dream 39.7 \n", - "19 Adelie Penguin (Pygoscelis adeliae) Dream 37.8 \n", - "20 Chinstrap penguin (Pygoscelis antarctica) Dream 46.6 \n", - "21 Chinstrap penguin (Pygoscelis antarctica) Dream 51.3 \n", - "22 Adelie Penguin (Pygoscelis adeliae) Dream 40.2 \n", - "23 Adelie Penguin (Pygoscelis adeliae) Dream 36.8 \n", - "24 Chinstrap penguin (Pygoscelis antarctica) Dream 49.6 \n", - "\n", - " culmen_depth_mm flipper_length_mm body_mass_g sex \n", - "0 18.4 184.0 3475.0 FEMALE \n", - "1 19.1 184.0 4650.0 MALE \n", - "2 18.9 184.0 3900.0 MALE \n", - "3 17.9 192.0 3500.0 FEMALE \n", - "4 16.8 192.0 3000.0 FEMALE \n", - "5 18.5 192.0 4100.0 MALE \n", - "6 16.6 192.0 2700.0 FEMALE \n", - "7 18.4 200.0 3400.0 FEMALE \n", - "8 19.0 200.0 3800.0 MALE \n", - "9 20.1 200.0 3975.0 
MALE \n", - "10 18.9 208.0 4300.0 MALE \n", - "11 18.7 185.0 3650.0 MALE \n", - "12 16.9 185.0 3000.0 FEMALE \n", - "13 17.3 185.0 3700.0 FEMALE \n", - "14 17.1 185.0 3400.0 FEMALE \n", - "15 16.5 185.0 3400.0 FEMALE \n", - "16 17.3 193.0 3600.0 FEMALE \n", - "17 19.4 193.0 3800.0 MALE \n", - "18 17.9 193.0 4250.0 MALE \n", - "19 18.1 193.0 3750.0 MALE \n", - "20 17.8 193.0 3800.0 FEMALE \n", - "21 19.2 193.0 3650.0 MALE \n", - "22 17.1 193.0 3400.0 FEMALE \n", - "23 18.5 193.0 3500.0 FEMALE \n", - "24 18.2 193.0 3775.0 MALE \n", - "...\n", - "\n", - "[344 rows x 7 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = bigframes.pandas.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")\n", - "df" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Clean and prepare the data" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
islandculmen_length_mmculmen_depth_mmflipper_length_mmbody_mass_gsex
0Dream36.618.4184.03475.0FEMALE
1Dream39.819.1184.04650.0MALE
2Dream40.918.9184.03900.0MALE
4Dream37.316.8192.03000.0FEMALE
5Dream43.218.5192.04100.0MALE
9Dream40.220.1200.03975.0MALE
10Dream40.818.9208.04300.0MALE
11Dream39.018.7185.03650.0MALE
12Dream37.016.9185.03000.0FEMALE
14Dream34.017.1185.03400.0FEMALE
15Dream37.016.5185.03400.0FEMALE
18Dream39.717.9193.04250.0MALE
19Dream37.818.1193.03750.0MALE
22Dream40.217.1193.03400.0FEMALE
23Dream36.818.5193.03500.0FEMALE
26Dream41.518.5201.04000.0MALE
31Dream33.116.1178.02900.0FEMALE
32Dream37.218.1178.03900.0MALE
33Dream39.516.7178.03250.0FEMALE
35Dream36.018.5186.03100.0FEMALE
\n", - "
[146 rows x 6 columns in total]" - ], - "text/plain": [ - " island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g \\\n", - "0 Dream 36.6 18.4 184.0 3475.0 \n", - "1 Dream 39.8 19.1 184.0 4650.0 \n", - "2 Dream 40.9 18.9 184.0 3900.0 \n", - "4 Dream 37.3 16.8 192.0 3000.0 \n", - "5 Dream 43.2 18.5 192.0 4100.0 \n", - "9 Dream 40.2 20.1 200.0 3975.0 \n", - "10 Dream 40.8 18.9 208.0 4300.0 \n", - "11 Dream 39.0 18.7 185.0 3650.0 \n", - "12 Dream 37.0 16.9 185.0 3000.0 \n", - "14 Dream 34.0 17.1 185.0 3400.0 \n", - "15 Dream 37.0 16.5 185.0 3400.0 \n", - "18 Dream 39.7 17.9 193.0 4250.0 \n", - "19 Dream 37.8 18.1 193.0 3750.0 \n", - "22 Dream 40.2 17.1 193.0 3400.0 \n", - "23 Dream 36.8 18.5 193.0 3500.0 \n", - "26 Dream 41.5 18.5 201.0 4000.0 \n", - "31 Dream 33.1 16.1 178.0 2900.0 \n", - "32 Dream 37.2 18.1 178.0 3900.0 \n", - "33 Dream 39.5 16.7 178.0 3250.0 \n", - "35 Dream 36.0 18.5 186.0 3100.0 \n", - "36 Dream 39.6 18.1 186.0 4450.0 \n", - "38 Dream 41.3 20.3 194.0 3550.0 \n", - "41 Dream 35.7 18.0 202.0 3550.0 \n", - "51 Dream 38.1 17.6 187.0 3425.0 \n", - "53 Dream 36.0 17.1 187.0 3700.0 \n", - "\n", - " sex \n", - "0 FEMALE \n", - "1 MALE \n", - "2 MALE \n", - "4 FEMALE \n", - "5 MALE \n", - "9 MALE \n", - "10 MALE \n", - "11 MALE \n", - "12 FEMALE \n", - "14 FEMALE \n", - "15 FEMALE \n", - "18 MALE \n", - "19 MALE \n", - "22 FEMALE \n", - "23 FEMALE \n", - "26 MALE \n", - "31 FEMALE \n", - "32 MALE \n", - "33 FEMALE \n", - "35 FEMALE \n", - "36 MALE \n", - "38 MALE \n", - "41 FEMALE \n", - "51 FEMALE \n", - "53 FEMALE \n", - "...\n", - "\n", - "[146 rows x 6 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# filter down to the data we want to analyze\n", - "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", - "\n", - "# drop the columns we don't care about\n", - "adelie_data = adelie_data.drop(columns=[\"species\"])\n", - "\n", - "# drop rows 
with nulls to get our training data\n", - "training_data = adelie_data.dropna()\n", - "\n", - "# take a peek at the training data\n", - "training_data" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "# pick feature columns and label column\n", - "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "label_columns = training_data[['body_mass_g']]\n", - "\n", - "# also get the rows that we want to make predictions for (i.e. where the feature column is null)\n", - "missing_body_mass = adelie_data[adelie_data.body_mass_g.isnull()]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Train and evaluate a linear regression model using the ML API" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
mean_absolute_errormean_squared_errormean_squared_log_errormedian_absolute_errorr2_scoreexplained_variance
0223.87876378553.6016340.005614181.3309110.6239510.623951
\n", - "
[1 rows x 6 columns in total]" - ], - "text/plain": [ - " mean_absolute_error mean_squared_error mean_squared_log_error \\\n", - "0 223.878763 78553.601634 0.005614 \n", - "\n", - " median_absolute_error r2_score explained_variance \n", - "0 181.330911 0.623951 0.623951 \n", - "\n", - "[1 rows x 6 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from bigframes.ml.linear_model import LinearRegression\n", - "\n", - "# as in scikit-learn, a newly created model is just a bundle of parameters\n", - "# default parameters are fine here\n", - "model = LinearRegression()\n", - "\n", - "# this will train a temporary model in BigQuery Machine Learning\n", - "model.fit(feature_columns, label_columns)\n", - "\n", - "# check how the model performed\n", - "model.score(feature_columns, label_columns)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Make predictions using the model" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
predicted_body_mass_g
2923603.735118
\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " predicted_body_mass_g\n", - "292 3603.735118\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.predict(missing_body_mass)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save the trained model to BigQuery, so we can load it later" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "LinearRegression()" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model.to_gbq(\"bqml_tutorial.penguins_model\", replace=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.9" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/01 - Getting Started.ipynb b/notebooks/01 - Getting Started.ipynb deleted file mode 100644 index 473bdd8cea..0000000000 --- a/notebooks/01 - Getting Started.ipynb +++ /dev/null @@ -1,1190 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "d7a03de2-c0ef-4f80-9cd5-f96e87cf2d54", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# On the instance where you are running jupyter,\n", - "# authenticate with gcloud first:\n", - "#\n", - "# gcloud auth application-default login\n", - "\n", - "import bigframes.pandas as bpd\n", - "\n", - "# Change this location to the location of your datasets.\n", - "# We use \"us\" as that is the location of the sample data.\n", - "bpd.options.bigquery.location = \"us\"" - 
] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "941cb6c3-8c54-42ce-a945-4fa604176b2e", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5c480b60490940d3a45fa6b9ca2cecdb", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 411d90c8-8b22-40b0-ad42-04f9e38c074e is DONE. 0 Bytes processed. \n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
taxi_idtrip_end_timestamptrip_secondstrip_milespickup_census_tractdropoff_census_tractpickup_community_areadropoff_community_areafaretips...extrastrip_totalpayment_typecompanypickup_latitudepickup_longitudepickup_locationdropoff_latitudedropoff_longitudedropoff_location
trip_start_timestampunique_key
2023-07-01 00:00:00+00:00fb004dfe98302ccf34e5e11bf1081568a94843a275cf3a53aae5e5858361a7ca64f75d3407dc0a44d7bc42...2023-07-01 00:00:00+00:005621.82<NA><NA>22229.52.53...0.012.03MobileCity Service41.922761-87.699155POINT (-87.6991553432 41.9227606205)41.922761-87.699155POINT (-87.6991553432 41.9227606205)
\n", - "

1 rows × 21 columns

\n", - "[1 rows x 21 columns in total]" - ], - "text/plain": [ - " taxi_id \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 75cf3a53aae5e5858361a7ca64f75d3407dc0a44d7bc42... \n", - "\n", - " trip_end_timestamp \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 2023-07-01 00:00:00+00:00 \n", - "\n", - " trip_seconds \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 562 \n", - "\n", - " trip_miles \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 1.82 \n", - "\n", - " pickup_census_tract \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 \n", - "\n", - " dropoff_census_tract \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 \n", - "\n", - " pickup_community_area \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 22 \n", - "\n", - " dropoff_community_area \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 22 \n", - "\n", - " fare \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 9.5 \n", - "\n", - " tips ... \\\n", - "trip_start_timestamp unique_key ... \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 2.53 ... 
\n", - "\n", - " extras \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 0.0 \n", - "\n", - " trip_total \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 12.03 \n", - "\n", - " payment_type \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 Mobile \n", - "\n", - " company \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 City Service \n", - "\n", - " pickup_latitude \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 41.922761 \n", - "\n", - " pickup_longitude \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 -87.699155 \n", - "\n", - " pickup_location \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 POINT (-87.6991553432 41.9227606205) \n", - "\n", - " dropoff_latitude \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 41.922761 \n", - "\n", - " dropoff_longitude \\\n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 -87.699155 \n", - "\n", - " dropoff_location \n", - "trip_start_timestamp unique_key \n", - "2023-07-01 00:00:00+00:00 fb004dfe98302ccf34e5e11bf1081568a94843a2 POINT (-87.6991553432 41.9227606205) \n", - "\n", - "[1 rows x 21 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.tail(n=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f1eee1c6-7214-440e-bc17-6839d53a6718", - "metadata": {}, - "outputs": [], - "source": [ - "df = df[[\n", - " \"company\",\n", - " 
\"trip_miles\",\n", - " \"fare\",\n", - " \"tips\",\n", - "]]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "66071984-e371-4161-8d7e-00d810b5cfab", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "0ce3a9e3bd90477dac718b8e60fd3458", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job b105f8f6-fe3d-4f90-85c7-2cbede41413b is DONE. 0 Bytes processed.
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
companytrip_milesfaretips
trip_start_timestampunique_key
2013-01-01 00:00:00+00:00006d53094ff5b3c81140c00aa91fdb5467db0802<NA>2.37.850.0
00a723cf08cf29c56faa8ef77be519f6590b3405<NA>1.05.650.0
01396f0debefe45ccce23e48dd471edd67bd32e8Dispatch Taxi Affiliation0.08.850.0
01e9a03fd793670ed35ef7195eeb99775895611fDispatch Taxi Affiliation1.87.450.0
024f0f183c43ad2be33f232fd8dca0a86a3e4925<NA>4.914.050.0
04373ef74229c2122ff90d92642fcd94c88302e4Dispatch Taxi Affiliation0.06.650.0
0501a22fba176594d6f974360e20737af1b6cb66<NA>2.27.650.0
05faff951f2a2703fe3eb24efac026c2a1669a92<NA>3.39.650.0
0651a5e1eb4792a53395c3b3b4fd9f7f8383cfbeBlue Ribbon Taxi Association Inc.0.06.250.0
06b4dc6dcd6295e69284e1c9d5a98015b0588ea6Choice Taxi Association0.013.850.0
07483a0baa6df0af68696b6f2483b7f429ff1827Northwest Management LLC0.64.650.0
07585d085f65cd512b94de09684253fe0ffb31a2Taxi Affiliation Services7.04.650.0
090ea9dc5dc26374ee06cdab1b809895beb1befcDispatch Taxi Affiliation0.010.050.0
091eda4ec41520d6ba96003f05eb10744f73aa82Northwest Management LLC0.015.250.0
097f2efed248cb5900792a89bba96a64918345ceBlue Ribbon Taxi Association Inc.0.012.250.0
09ffa3f40d36dbf8d51ca903738d9bba92557ed7Taxi Affiliation Services0.06.450.0
0a3446fb4f72d6485548883cd7f6cfe068d44829Taxi Affiliation Services0.014.250.0
0a7ea707a8700555e898489200a66b546a834170<NA>5.0413.852.0
0be545d9934dd343a2d929fca2e53d3d2851137bDispatch Taxi Affiliation1.15.650.0
0c2b3705c9748a5949bf5b7d727b2d1f2e18805f<NA>12.326.250.0
0d37bbfeb207fbb2353e1911fdf878f3c24ed4c4Taxi Affiliation Services0.015.250.0
0dafcce34426b5377fd3925418a4c24b3e45c7d9<NA>1.26.250.0
0f5f7162b289dd9743b9d2e7fc75bf69696bb7bcTaxi Affiliation Services0.09.850.0
0f63007e437d589c9139acb026f03a832da3240fNorthwest Management LLC1.56.650.0
10508745a1069e9cdaad8760a6a243dc924cc0f2<NA>5.513.250.0
\n", - "

25 rows × 4 columns

\n", - "[208395018 rows x 4 columns in total]" - ], - "text/plain": [ - " company \\\n", - "trip_start_timestamp unique_key \n", - "2013-01-01 00:00:00+00:00 006d53094ff5b3c81140c00aa91fdb5467db0802 \n", - " 00a723cf08cf29c56faa8ef77be519f6590b3405 \n", - " 01396f0debefe45ccce23e48dd471edd67bd32e8 Dispatch Taxi Affiliation \n", - " 01e9a03fd793670ed35ef7195eeb99775895611f Dispatch Taxi Affiliation \n", - " 024f0f183c43ad2be33f232fd8dca0a86a3e4925 \n", - " 04373ef74229c2122ff90d92642fcd94c88302e4 Dispatch Taxi Affiliation \n", - " 0501a22fba176594d6f974360e20737af1b6cb66 \n", - " 05faff951f2a2703fe3eb24efac026c2a1669a92 \n", - " 0651a5e1eb4792a53395c3b3b4fd9f7f8383cfbe Blue Ribbon Taxi Association Inc. \n", - " 06b4dc6dcd6295e69284e1c9d5a98015b0588ea6 Choice Taxi Association \n", - " 07483a0baa6df0af68696b6f2483b7f429ff1827 Northwest Management LLC \n", - " 07585d085f65cd512b94de09684253fe0ffb31a2 Taxi Affiliation Services \n", - " 090ea9dc5dc26374ee06cdab1b809895beb1befc Dispatch Taxi Affiliation \n", - " 091eda4ec41520d6ba96003f05eb10744f73aa82 Northwest Management LLC \n", - " 097f2efed248cb5900792a89bba96a64918345ce Blue Ribbon Taxi Association Inc. 
\n", - " 09ffa3f40d36dbf8d51ca903738d9bba92557ed7 Taxi Affiliation Services \n", - " 0a3446fb4f72d6485548883cd7f6cfe068d44829 Taxi Affiliation Services \n", - " 0a7ea707a8700555e898489200a66b546a834170 \n", - " 0be545d9934dd343a2d929fca2e53d3d2851137b Dispatch Taxi Affiliation \n", - " 0c2b3705c9748a5949bf5b7d727b2d1f2e18805f \n", - " 0d37bbfeb207fbb2353e1911fdf878f3c24ed4c4 Taxi Affiliation Services \n", - " 0dafcce34426b5377fd3925418a4c24b3e45c7d9 \n", - " 0f5f7162b289dd9743b9d2e7fc75bf69696bb7bc Taxi Affiliation Services \n", - " 0f63007e437d589c9139acb026f03a832da3240f Northwest Management LLC \n", - " 10508745a1069e9cdaad8760a6a243dc924cc0f2 \n", - "\n", - " trip_miles \\\n", - "trip_start_timestamp unique_key \n", - "2013-01-01 00:00:00+00:00 006d53094ff5b3c81140c00aa91fdb5467db0802 2.3 \n", - " 00a723cf08cf29c56faa8ef77be519f6590b3405 1.0 \n", - " 01396f0debefe45ccce23e48dd471edd67bd32e8 0.0 \n", - " 01e9a03fd793670ed35ef7195eeb99775895611f 1.8 \n", - " 024f0f183c43ad2be33f232fd8dca0a86a3e4925 4.9 \n", - " 04373ef74229c2122ff90d92642fcd94c88302e4 0.0 \n", - " 0501a22fba176594d6f974360e20737af1b6cb66 2.2 \n", - " 05faff951f2a2703fe3eb24efac026c2a1669a92 3.3 \n", - " 0651a5e1eb4792a53395c3b3b4fd9f7f8383cfbe 0.0 \n", - " 06b4dc6dcd6295e69284e1c9d5a98015b0588ea6 0.0 \n", - " 07483a0baa6df0af68696b6f2483b7f429ff1827 0.6 \n", - " 07585d085f65cd512b94de09684253fe0ffb31a2 7.0 \n", - " 090ea9dc5dc26374ee06cdab1b809895beb1befc 0.0 \n", - " 091eda4ec41520d6ba96003f05eb10744f73aa82 0.0 \n", - " 097f2efed248cb5900792a89bba96a64918345ce 0.0 \n", - " 09ffa3f40d36dbf8d51ca903738d9bba92557ed7 0.0 \n", - " 0a3446fb4f72d6485548883cd7f6cfe068d44829 0.0 \n", - " 0a7ea707a8700555e898489200a66b546a834170 5.04 \n", - " 0be545d9934dd343a2d929fca2e53d3d2851137b 1.1 \n", - " 0c2b3705c9748a5949bf5b7d727b2d1f2e18805f 12.3 \n", - " 0d37bbfeb207fbb2353e1911fdf878f3c24ed4c4 0.0 \n", - " 0dafcce34426b5377fd3925418a4c24b3e45c7d9 1.2 \n", - " 0f5f7162b289dd9743b9d2e7fc75bf69696bb7bc 0.0 \n", 
- " 0f63007e437d589c9139acb026f03a832da3240f 1.5 \n", - " 10508745a1069e9cdaad8760a6a243dc924cc0f2 5.5 \n", - "\n", - " fare \\\n", - "trip_start_timestamp unique_key \n", - "2013-01-01 00:00:00+00:00 006d53094ff5b3c81140c00aa91fdb5467db0802 7.85 \n", - " 00a723cf08cf29c56faa8ef77be519f6590b3405 5.65 \n", - " 01396f0debefe45ccce23e48dd471edd67bd32e8 8.85 \n", - " 01e9a03fd793670ed35ef7195eeb99775895611f 7.45 \n", - " 024f0f183c43ad2be33f232fd8dca0a86a3e4925 14.05 \n", - " 04373ef74229c2122ff90d92642fcd94c88302e4 6.65 \n", - " 0501a22fba176594d6f974360e20737af1b6cb66 7.65 \n", - " 05faff951f2a2703fe3eb24efac026c2a1669a92 9.65 \n", - " 0651a5e1eb4792a53395c3b3b4fd9f7f8383cfbe 6.25 \n", - " 06b4dc6dcd6295e69284e1c9d5a98015b0588ea6 13.85 \n", - " 07483a0baa6df0af68696b6f2483b7f429ff1827 4.65 \n", - " 07585d085f65cd512b94de09684253fe0ffb31a2 4.65 \n", - " 090ea9dc5dc26374ee06cdab1b809895beb1befc 10.05 \n", - " 091eda4ec41520d6ba96003f05eb10744f73aa82 15.25 \n", - " 097f2efed248cb5900792a89bba96a64918345ce 12.25 \n", - " 09ffa3f40d36dbf8d51ca903738d9bba92557ed7 6.45 \n", - " 0a3446fb4f72d6485548883cd7f6cfe068d44829 14.25 \n", - " 0a7ea707a8700555e898489200a66b546a834170 13.85 \n", - " 0be545d9934dd343a2d929fca2e53d3d2851137b 5.65 \n", - " 0c2b3705c9748a5949bf5b7d727b2d1f2e18805f 26.25 \n", - " 0d37bbfeb207fbb2353e1911fdf878f3c24ed4c4 15.25 \n", - " 0dafcce34426b5377fd3925418a4c24b3e45c7d9 6.25 \n", - " 0f5f7162b289dd9743b9d2e7fc75bf69696bb7bc 9.85 \n", - " 0f63007e437d589c9139acb026f03a832da3240f 6.65 \n", - " 10508745a1069e9cdaad8760a6a243dc924cc0f2 13.25 \n", - "\n", - " tips \n", - "trip_start_timestamp unique_key \n", - "2013-01-01 00:00:00+00:00 006d53094ff5b3c81140c00aa91fdb5467db0802 0.0 \n", - " 00a723cf08cf29c56faa8ef77be519f6590b3405 0.0 \n", - " 01396f0debefe45ccce23e48dd471edd67bd32e8 0.0 \n", - " 01e9a03fd793670ed35ef7195eeb99775895611f 0.0 \n", - " 024f0f183c43ad2be33f232fd8dca0a86a3e4925 0.0 \n", - " 04373ef74229c2122ff90d92642fcd94c88302e4 0.0 \n", - " 
0501a22fba176594d6f974360e20737af1b6cb66 0.0 \n", - " 05faff951f2a2703fe3eb24efac026c2a1669a92 0.0 \n", - " 0651a5e1eb4792a53395c3b3b4fd9f7f8383cfbe 0.0 \n", - " 06b4dc6dcd6295e69284e1c9d5a98015b0588ea6 0.0 \n", - " 07483a0baa6df0af68696b6f2483b7f429ff1827 0.0 \n", - " 07585d085f65cd512b94de09684253fe0ffb31a2 0.0 \n", - " 090ea9dc5dc26374ee06cdab1b809895beb1befc 0.0 \n", - " 091eda4ec41520d6ba96003f05eb10744f73aa82 0.0 \n", - " 097f2efed248cb5900792a89bba96a64918345ce 0.0 \n", - " 09ffa3f40d36dbf8d51ca903738d9bba92557ed7 0.0 \n", - " 0a3446fb4f72d6485548883cd7f6cfe068d44829 0.0 \n", - " 0a7ea707a8700555e898489200a66b546a834170 2.0 \n", - " 0be545d9934dd343a2d929fca2e53d3d2851137b 0.0 \n", - " 0c2b3705c9748a5949bf5b7d727b2d1f2e18805f 0.0 \n", - " 0d37bbfeb207fbb2353e1911fdf878f3c24ed4c4 0.0 \n", - " 0dafcce34426b5377fd3925418a4c24b3e45c7d9 0.0 \n", - " 0f5f7162b289dd9743b9d2e7fc75bf69696bb7bc 0.0 \n", - " 0f63007e437d589c9139acb026f03a832da3240f 0.0 \n", - " 10508745a1069e9cdaad8760a6a243dc924cc0f2 0.0 \n", - "...\n", - "\n", - "[208395018 rows x 4 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "e51b687e-0282-459c-8fd0-0ca22cfb153d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bc4d2a7cee5e4fc1a9513ec2fb54d293", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HTML(value='Query job 494b3016-0582-45e0-9fcf-bb38eb00722e is RUNNING.
\n", + " \n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + " \n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.9" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "The goal of this notebook is to demonstrate an enterprise generative AI use case. A marketing user can provide information about a new pharmaceutical drug and its generic name, and receive ideas on marketing-oriented brand names for that drug.\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/bigquery/docs/dataframes-quickstart)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn about Generative AI concepts such as prompting and few-shot learning, as well as how to use BigFrames ML for performing these tasks simply using an intuitive dataframe API.\n", + "\n", + "The steps performed include:\n", + "\n", + "1. Ask the user for the generic name and usage for the drug.\n", + "1. Use `bigframes` to query the FDA dataset of over 100,000 drugs, filtered on the brand name, generic name, and indications & usage columns.\n", + "1. Filter this dataset to find prototypical brand names that can be used as examples in prompt tuning.\n", + "1. Create a prompt with the user input, general instructions, examples and counter-examples for the desired brand name.\n", + "1. Use the `bigframes.ml.llm.PaLM2TextGenerator` to generate choices of brand names." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This notebook uses the [FDA dataset](https://cloud.google.com/blog/topics/healthcare-life-sciences/fda-mystudies-comes-to-google-cloud) available at [`bigquery-public-data.fda_drug`](https://console.cloud.google.com/bigquery?ws=!1m4!1m3!3m2!1sbigquery-public-data!2sfda_drug)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (compute)\n", + "* BigQuery ML\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models),\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages required to execute this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "!pip install -U --quiet bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only: Uncomment the following cell to restart the kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bpd\n", + "from google.cloud import bigquery_connection_v1 as bq_connection\n", + "from bigframes.ml.llm import PaLM2TextGenerator\n", + "from IPython.display import Markdown" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you may have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**1. Vertex AI Workbench**\n", + "* Do nothing as you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**2. Local JupyterLab instance, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**3. 
Colab, uncomment and run:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 free credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, you need to install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "**If you don't know your project ID**, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! 
gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "evsJaAj5te0X" + }, + "source": [ + "#### BigFrames configuration\n", + "\n", + "Next, we will specify a [BigQuery connection](https://cloud.google.com/bigquery/docs/working-with-connections). If you already have a connection, you can simply provide the name and skip the following creation steps.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "G1vVsPiMsL2X" + }, + "outputs": [], + "source": [ + "# Please fill in these values.\n", + "LOCATION = \"us\" # @param {type:\"string\"}\n", + "CONNECTION = \"\" # @param {type:\"string\"}\n", + "\n", + "connection_name = f\"{PROJECT_ID}.{LOCATION}.{CONNECTION}\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WGS_TzhWlPBN" + }, + "source": [ + "We will now try to use the provided connection, and if it doesn't exist, create a new one. We will also print the service account used." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "56Hw42m6kFrj" + }, + "outputs": [], + "source": [ + "# Initialize client and set request parameters\n", + "client = bq_connection.ConnectionServiceClient()\n", + "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{LOCATION}\"\n", + "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}\"\n", + "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", + "\n", + "# Try to connect using provided connection\n", + "try:\n", + " request = client.get_connection(\n", + " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", + " )\n", + " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", + "# Create a new connection on error\n", + "except Exception:\n", + " connection = bq_connection.types.Connection(\n", + " {\"friendly_name\": CONNECTION, \"cloud_resource\": cloud_resource_properties}\n", + " )\n", + " request = bq_connection.CreateConnectionRequest(\n", + " {\n", + " \"parent\": new_conn_parent,\n", + " \"connection_id\": CONNECTION,\n", + " \"connection\": connection,\n", + " }\n", + " )\n", + " response = client.create_connection(request)\n", + " CONN_SERVICE_ACCOUNT = (\n", + " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", + " )\n", + "# Set service account permissions\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'\n", + "\n", + "print(CONN_SERVICE_ACCOUNT)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "### Initialize BigFrames client\n", + "\n", + "Here, we set the project configuration based on the provided parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OCccLirpkSRz" + }, + "outputs": [], + "source": [ + "bpd.options.bigquery.project = PROJECT_ID\n", + "bpd.options.bigquery.location = LOCATION" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "m8UCEtX9uLn6" + }, + "source": [ + "## Generate a name\n", + "\n", + "Let's start with entering a generic name and description of the drug." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oxphj2gnuKou" + }, + "outputs": [], + "source": [ + "GENERIC_NAME = \"Entropofloxacin\" # @param {type:\"string\"}\n", + "USAGE = \"Entropofloxacin is a fluoroquinolone antibiotic that is used to treat a variety of bacterial infections, including: pneumonia, streptococcus infections, salmonella infections, escherichia coli infections, and pseudomonas aeruginosa infections It is taken by mouth or by injection. The dosage and frequency of administration will vary depending on the type of infection being treated. It should be taken for the full course of treatment, even if symptoms improve after a few days. Stopping the medication early may increase the risk of the infection coming back.\" # @param {type:\"string\"}\n", + "NUM_NAMES = 10 # @param {type:\"integer\"}\n", + "TEMPERATURE = 0.5 # @param {type: \"number\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1q-vlbalzu1Q" + }, + "source": [ + "We can now create a prompt string, and populate it with the name and description." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0knz5ZWMzed-" + }, + "outputs": [], + "source": [ + "zero_shot_prompt = f\"\"\"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format. 
Do not provide any additional explanation.\n", + "\n", + "Be creative with the brand names. Don't use English words directly; use variants or invented words.\n", + "\n", + "The generic name is: {GENERIC_NAME}\n", + "\n", + "The indications and usage are: {USAGE}.\"\"\"\n", + "\n", + "print(zero_shot_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LCRE2L720f5y" + }, + "source": [ + "Next, let's create a helper function to predict with our model. It will take a string input, and add it to a temporary BigFrames `DataFrame`. It will also return the string extracted from the response `DataFrame`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LB3xgDroIxlx" + }, + "outputs": [], + "source": [ + "def predict(prompt: str, temperature: float = TEMPERATURE) -> str:\n", + " # Create dataframe\n", + " input = bpd.DataFrame(\n", + " {\n", + " \"prompt\": [prompt],\n", + " }\n", + " )\n", + "\n", + " # Return response\n", + " return model.predict(input, temperature).ml_generate_text_llm_result.iloc[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "b1ZapNZsJW2p" + }, + "source": [ + "We can now initialize the model, and get a response to our prompt!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UW2fQ2k5Hsic" + }, + "outputs": [], + "source": [ + "# Get BigFrames session\n", + "session = bpd.get_global_session()\n", + "\n", + "# Define the model\n", + "model = PaLM2TextGenerator(session=session, connection_name=connection_name)\n", + "\n", + "# Invoke LLM with prompt\n", + "response = predict(zero_shot_prompt)\n", + "\n", + "# Print results as Markdown\n", + "Markdown(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "o3yIhHV2jsUT" + }, + "source": [ + "We're off to a great start! Let's see if we can refine our response." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mBroUzWS8xOL" + }, + "source": [ + "## Few-shot learning\n", + "\n", + "Let's try using [few-shot learning](https://paperswithcode.com/task/few-shot-learning). We will provide a few examples of what we're looking for along with our prompt.\n", + "\n", + "Our prompt will consist of 3 parts:\n", + "* General instructions (e.g. generate $n$ brand names)\n", + "* Multiple examples\n", + "* Information about the drug we'd like to generate a name for\n", + "\n", + "Let's walk through how to construct this prompt.\n", + "\n", + "Our first step will be to define how many examples we want to provide in the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MXdI78SOElyt" + }, + "outputs": [], + "source": [ + "# Specify number of examples to include\n", + "\n", + "NUM_EXAMPLES = 3 # @param {type:\"integer\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U8w4puVM_892" + }, + "source": [ + "Next, let's define a prefix that will set the overall context." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "aQ2iscnhF2cx" + }, + "outputs": [], + "source": [ + "prefix_prompt = f\"\"\"Provide {NUM_NAMES} unique and modern brand names in Markdown bullet point format, related to the drug at the bottom of this prompt.\n", + "\n", + "Be creative with the brand names. 
Don't use English words directly; use variants or invented words.\n", + "\n", + "First, we will provide {NUM_EXAMPLES} examples to help with your thought process.\n", + "\n", + "Then, we will provide the generic name and usage for the drug we'd like you to generate brand names for.\n", + "\"\"\"\n", + "\n", + "print(prefix_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VI0Spv-axN7d" + }, + "source": [ + "Our next step will be to include examples into the prompt.\n", + "\n", + "We will start out by retrieving the raw data for the examples, by querying the BigQuery public dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IoO_Bp8wA07N" + }, + "outputs": [], + "source": [ + "# Query 3 columns of interest from drug label dataset\n", + "df = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", + " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + "\n", + "# Exclude any rows with missing data\n", + "df = df.dropna()\n", + "\n", + "# Drop duplicate rows\n", + "df = df.drop_duplicates()\n", + "\n", + "# Print values\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W5kOtbNGBTI2" + }, + "source": [ + "Let's now filter the results to remove atypical names." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "95WDe2eCCeLx" + }, + "outputs": [], + "source": [ + "# Remove names with spaces\n", + "df = df[df[\"openfda_brand_name\"].str.find(\" \") == -1]\n", + "\n", + "# Remove names with 5 or fewer characters\n", + "df = df[df[\"openfda_brand_name\"].str.len() > 5]\n", + "\n", + "# Remove names where the generic and brand name match (case-insensitive)\n", + "df = df[df[\"openfda_generic_name\"].str.lower() != df[\"openfda_brand_name\"].str.lower()]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FZD89ep4EyYc" + }, + "source": [ + "Let's take `NUM_EXAMPLES` samples to include in the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2ohZYg7QEyJV" + }, + "outputs": [], + "source": [ + "# Take a sample and convert to a Pandas dataframe for local usage.\n", + "df_examples = df.sample(NUM_EXAMPLES, random_state=3).to_pandas()\n", + "\n", + "df_examples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J-Qa1_SCImXy" + }, + "source": [ + "Let's now convert the data to a JSON structure, to enable embedding into a prompt. For consistency, we'll capitalize each example brand name." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PcJdSaw0EGcW" + }, + "outputs": [], + "source": [ + "examples = [\n", + " {\n", + " \"brand_name\": brand_name.capitalize(),\n", + " \"generic_name\": generic_name,\n", + " \"usage\": usage,\n", + " }\n", + " for brand_name, generic_name, usage in zip(\n", + " df_examples[\"openfda_brand_name\"],\n", + " df_examples[\"openfda_generic_name\"],\n", + " df_examples[\"indications_and_usage\"],\n", + " )\n", + "]\n", + "\n", + "print(examples)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oU4mb1Dwgq64" + }, + "source": [ + "We'll create a prompt template for each example, and view the first one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kzAVsF6wJ93S" + }, + "outputs": [], + "source": [ + "example_prompt = \"\"\n", + "for example in examples:\n", + " example_prompt += f\"Generic name: {example['generic_name']}\\nUsage: {example['usage']}\\nBrand name: {example['brand_name']}\\n\\n\"\n", + "\n", + "example_prompt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kbV2X1CXAyLV" + }, + "source": [ + "Finally, we can create a suffix to our prompt. This will contain the generic name of the drug, its usage, ending with a request for brand names." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "OYp6W_XfHTlo" + }, + "outputs": [], + "source": [ + "suffix_prompt = f\"\"\"Generic name: {GENERIC_NAME}\n", + "Usage: {USAGE}\n", + "Brand names:\"\"\"\n", + "\n", + "print(suffix_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RiaisW1nihJP" + }, + "source": [ + "Let's pull it all together into a few-shot prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "99xdU7l8C1h8" + }, + "outputs": [], + "source": [ + "# Define the prompt\n", + "few_shot_prompt = prefix_prompt + example_prompt + suffix_prompt\n", + "\n", + "# Print the prompt\n", + "print(few_shot_prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nbUWdHtfitWn" + }, + "source": [ + "Now, let's pass our prompt to the LLM, and get a response!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "d4ODRJdvLhlQ" + }, + "outputs": [], + "source": [ + "response = predict(few_shot_prompt)\n", + "\n", + "Markdown(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pFakjrTElOBs" + }, + "source": [ + "# Bulk generation\n", + "\n", + "Let's take these experiments to the next level by generating many names in bulk. 
We'll see how to leverage BigFrames at scale!\n", + "\n", + "We can start by finding drugs that are missing brand names. There are approximately 4,000 drugs that meet these criteria. We'll put a limit of 100 in this notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8eAutS41mx6U" + }, + "outputs": [], + "source": [ + "# Query 3 columns of interest from drug label dataset\n", + "df_missing = bpd.read_gbq(\"bigquery-public-data.fda_drug.drug_label\",\n", + " col_order=[\"openfda_generic_name\", \"openfda_brand_name\", \"indications_and_usage\"])\n", + "\n", + "# Exclude any rows with missing data\n", + "df_missing = df_missing.dropna()\n", + "\n", + "# Include rows in which openfda_brand_name equals openfda_generic_name\n", + "df_missing = df_missing[df_missing[\"openfda_generic_name\"] == df_missing[\"openfda_brand_name\"]]\n", + "\n", + "# Limit the number of rows for demonstration purposes\n", + "df_missing = df_missing.head(100)\n", + "\n", + "# Print values\n", + "df_missing.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fm6L8S7eVnCI" + }, + "source": [ + "We will create a column `prompt` with a customized prompt for each row." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "19TvGN1PVmVX" + }, + "outputs": [], + "source": [ + "df_missing[\"prompt\"] = (\n", + " \"Provide a unique and modern brand name related to this pharmaceutical drug.\"\n", + " + \"Don't use English words directly; use variants or invented words. The generic name is: \"\n", + " + df_missing[\"openfda_generic_name\"]\n", + " + \". The indications and usage are: \"\n", + " + df_missing[\"indications_and_usage\"]\n", + " + \".\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "njxwBvCKgMPE" + }, + "source": [ + "We'll create a new helper method, `batch_predict()`, and query the LLM. The job may take a couple of minutes to execute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tiSHa5B4aFhw" + }, + "outputs": [], + "source": [ + "def batch_predict(\n", + " input: bpd.DataFrame, temperature: float = TEMPERATURE\n", + ") -> bpd.DataFrame:\n", + " return model.predict(input, temperature).ml_generate_text_llm_result\n", + "\n", + "\n", + "response = batch_predict(df_missing[\"prompt\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K5a2nHdLgZEj" + }, + "source": [ + "Let's check the results for one of our responses!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "TnizdeqBdbZj" + }, + "outputs": [], + "source": [ + "# Pick a sample\n", + "k = 0\n", + "\n", + "# Gather the prompt and response details\n", + "prompt_generic = df_missing[\"openfda_generic_name\"][k].iloc[0]\n", + "prompt_usage = df_missing[\"indications_and_usage\"][k].iloc[0]\n", + "response_str = response[k].iloc[0]\n", + "\n", + "# Print details\n", + "print(f\"Generic name: {prompt_generic}\")\n", + "print(f\"Brand name: {prompt_usage}\")\n", + "print(f\"Response: {response_str}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "W4MviwyMI-Qh" + }, + "source": [ + "Congratulations! You have learned how to use generative AI to jumpstart the creative process.\n", + "\n", + "You've also seen how BigFrames can manage each step of the process, including gathering data, data manipulation, and querying the LLM." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Bys6--dVmq7R" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cIODjOLump_-" + }, + "outputs": [], + "source": [ + "# Delete the BigQuery Connection\n", + "from google.cloud import bigquery_connection_v1 as bq_connection\n", + "client = bq_connection.ConnectionServiceClient()\n", + "CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{LOCATION}/connections/{CONNECTION}\"\n", + "client.delete_connection(name=CONNECTION_ID)\n", + "print(f\"Deleted connection {CONNECTION_ID}.\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/06 - Using ML - Large Language Models.ipynb b/notebooks/generative_ai/large_language_models.ipynb similarity index 100% rename from notebooks/06 - Using ML - Large Language Models.ipynb rename to notebooks/generative_ai/large_language_models.ipynb diff --git a/notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb b/notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb new file mode 100644 index 0000000000..39e2ef535c --- /dev/null +++ b/notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2022 Google LLC\n", + "#\n", + "# Licensed under the Apache 
License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "## Use BigQuery DataFrames with Generative AI for code generation\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Use this notebook to walk through an example use case of generating sample code by using BigQuery DataFrames and its integration with Generative AI support on Vertex AI.\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/python/docs/reference/bigframes/latest)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you create a CSV file containing sample code for calling a given set of APIs.\n", + "\n", + "The steps include:\n", + "\n", + "- Defining an LLM model in BigQuery DataFrames, specifically the [`text-bison` model of the PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text), using `bigframes.ml.llm`.\n", + "- Creating a DataFrame by reading in data from Cloud Storage.\n", + "- Manipulating data in the DataFrame to build LLM prompts.\n", + "- Sending DataFrame prompts to the LLM model using the `predict` method.\n", + "- Creating and using a custom function to transform the output provided by the LLM model response.\n", + "- Exporting the resulting transformed DataFrame as a CSV file." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses a dataset listing the names of various pandas DataFrame and Series APIs." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery\n", + "* Generative AI support on Vertex AI\n", + "* Cloud Functions\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models),\n", + "[Generative AI support on Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing#generative_ai_models), and [Cloud Functions pricing](https://cloud.google.com/functions/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages, which are required to run this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "2b4ef9b72d43" + }, + "outputs": [], + "source": [ + "!pip install bigframes --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. 
[Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n", + "\n", + " * BigQuery API\n", + " * BigQuery Connection API\n", + " * Cloud Functions API\n", + " * Cloud Run API\n", + " * Artifact Registry API\n", + " * Cloud Build API\n", + " * Cloud Resource Manager API\n", + " * Vertex AI API\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ], + "metadata": { + "id": "Wbr2aVtFQBcg" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eF-Twtc4XGem" + }, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf\n", + "from google.cloud import bigquery\n", + "from google.cloud import bigquery_connection_v1 as bq_connection" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "id": "NPPMuw2PXGeo" + }, + "outputs": [], + "source": [ + "bf.options.bigquery.project = PROJECT_ID\n", + "bf.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.reset_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." + ], + "metadata": { + "id": "DTVtFlqeFbrU" + } + }, + { + "cell_type": "markdown", + "source": [ + "# Define the LLM model\n", + "\n", + "BigQuery DataFrames provides integration with [`text-bison` model of the PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text) via Vertex AI.\n", + "\n", + "This section walks through a few steps required in order to use the model in your notebook." + ], + "metadata": { + "id": "6eytf4xQHzcF" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Create a BigQuery Cloud resource connection\n", + "\n", + "You need to create a [Cloud resource connection](https://cloud.google.com/bigquery/docs/create-cloud-resource-connection) to enable BigQuery DataFrames to interact with Vertex AI services." 
+ ], + "metadata": { + "id": "rS4VO1TGiO4G" + } + }, + { + "cell_type": "code", + "source": [ + "CONN_NAME = \"bqdf-llm\"\n", + "\n", + "client = bq_connection.ConnectionServiceClient()\n", + "new_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}\"\n", + "exists_conn_parent = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n", + "cloud_resource_properties = bq_connection.CloudResourceProperties({})\n", + "\n", + "try:\n", + " request = client.get_connection(\n", + " request=bq_connection.GetConnectionRequest(name=exists_conn_parent)\n", + " )\n", + " CONN_SERVICE_ACCOUNT = f\"serviceAccount:{request.cloud_resource.service_account_id}\"\n", + "except Exception:\n", + " connection = bq_connection.types.Connection(\n", + " {\"friendly_name\": CONN_NAME, \"cloud_resource\": cloud_resource_properties}\n", + " )\n", + " request = bq_connection.CreateConnectionRequest(\n", + " {\n", + " \"parent\": new_conn_parent,\n", + " \"connection_id\": CONN_NAME,\n", + " \"connection\": connection,\n", + " }\n", + " )\n", + " response = client.create_connection(request)\n", + " CONN_SERVICE_ACCOUNT = (\n", + " f\"serviceAccount:{response.cloud_resource.service_account_id}\"\n", + " )\n", + "print(CONN_SERVICE_ACCOUNT)" + ], + "metadata": { + "id": "KFPjDM4LVh96" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Set permissions for the service account\n", + "\n", + "The resource connection service account requires certain project-level permissions:\n", + " - `roles/aiplatform.user` and `roles/bigquery.connectionUser`: These roles are required for the connection to create a model definition using the LLM model in Vertex AI ([documentation](https://cloud.google.com/bigquery/docs/generate-text#give_the_service_account_access)).\n", + " - `roles/run.invoker`: This role is required for the connection to have read-only access to Cloud Run services that back custom/remote functions 
([documentation](https://cloud.google.com/bigquery/docs/remote-functions#grant_permission_on_function)).\n", + "\n", + "Set these permissions by running the following `gcloud` commands:" + ], + "metadata": { + "id": "W6l6Ol2biU9h" + } + }, + { + "cell_type": "code", + "source": [ + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/bigquery.connectionUser'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/aiplatform.user'\n", + "!gcloud projects add-iam-policy-binding {PROJECT_ID} --condition=None --no-user-output-enabled --member={CONN_SERVICE_ACCOUNT} --role='roles/run.invoker'" + ], + "metadata": { + "id": "d8wja24SVq6s" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Define the model\n", + "\n", + "Use `bigframes.ml.llm` to define the model:" + ], + "metadata": { + "id": "qUjT8nw-jIXp" + } + }, + { + "cell_type": "code", + "source": [ + "from bigframes.ml.llm import PaLM2TextGenerator\n", + "\n", + "session = bf.get_global_session()\n", + "connection = f\"{PROJECT_ID}.{REGION}.{CONN_NAME}\"\n", + "model = PaLM2TextGenerator(session=session, connection_name=connection)" + ], + "metadata": { + "id": "sdjeXFwcHfl7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Read data from Cloud Storage into BigQuery DataFrames\n", + "\n", + "You can create a BigQuery DataFrames DataFrame by reading data from any of the following locations:\n", + "\n", + "* A local data file\n", + "* Data stored in a BigQuery table\n", + "* A data file stored in Cloud Storage\n", + "* An in-memory pandas DataFrame\n", + "\n", + "In this tutorial, you create BigQuery DataFrames DataFrames by reading two CSV files stored in Cloud Storage, one containing a list of DataFrame API names and one containing 
a list of Series API names." + ], + "metadata": { + "id": "GbW0oCnU1s1N" + } + }, + { + "cell_type": "code", + "source": [ + "df_api = bf.read_csv(\"gs://cloud-samples-data/vertex-ai/bigframe/df.csv\")\n", + "series_api = bf.read_csv(\"gs://cloud-samples-data/vertex-ai/bigframe/series.csv\")" + ], + "metadata": { + "id": "SchiTkQGIJog" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Take a peek at a few rows of data for each file:" + ], + "metadata": { + "id": "7OBjw2nmQY3-" + } + }, + { + "cell_type": "code", + "source": [ + "df_api.head(2)" + ], + "metadata": { + "id": "QCqgVCIsGGuv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "series_api.head(2)" + ], + "metadata": { + "id": "BGJnZbgEGS5-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Generate code using the LLM model\n", + "\n", + "Prepare the prompts and send them to the LLM model for prediction." + ], + "metadata": { + "id": "m3ZJEsi7SUKV" + } + }, + { + "cell_type": "markdown", + "source": [ + "## Prompt design in BigQuery DataFrames\n", + "\n", + "Designing prompts for LLMs is a fast growing area and you can read more in [this documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/introduction-prompt-design).\n", + "\n", + "For this tutorial, you use a simple prompt to ask the LLM model for sample code for each of the API methods (or rows) from the last step's DataFrames. The output is the new DataFrames `df_prompt` and `series_prompt`, which contain the full prompt text." 
+ ], + "metadata": { + "id": "9EMAqR37AfLS" + } + }, + { + "cell_type": "code", + "source": [ + "df_prompt_prefix = \"Generate Pandas sample code for DataFrame.\"\n", + "series_prompt_prefix = \"Generate Pandas sample code for Series.\"\n", + "\n", + "df_prompt = (df_prompt_prefix + df_api['API'])\n", + "series_prompt = (series_prompt_prefix + series_api['API'])\n", + "\n", + "df_prompt.head(2)" + ], + "metadata": { + "id": "EDAaIwHpQCDZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## Make predictions using the LLM model\n", + "\n", + "Use the BigQuery DataFrames DataFrame containing the full prompt text as the input to the `predict` method. The `predict` method calls the LLM model and returns its generated text output back to two new BigQuery DataFrames DataFrames, `df_pred` and `series_pred`.\n", + "\n", + "Note: The predictions might take a few minutes to run." + ], + "metadata": { + "id": "rwPLjqW2Ajzh" + } + }, + { + "cell_type": "code", + "source": [ + "df_pred = model.predict(df_prompt.to_frame(), max_output_tokens=1024)\n", + "series_pred = model.predict(series_prompt.to_frame(), max_output_tokens=1024)" + ], + "metadata": { + "id": "6i6HkFJZa8na" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Once the predictions are processed, take a look at the sample output from the LLM, which provides code samples for the API names listed in the DataFrames dataset." + ], + "metadata": { + "id": "89cB8MW4UIdV" + } + }, + { + "cell_type": "code", + "source": [ + "print(df_pred['ml_generate_text_llm_result'].iloc[0])" + ], + "metadata": { + "id": "9A2gw6hP_2nX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Manipulate LLM output using a remote function\n", + "\n", + "The output that the LLM provides often contains additional text beyond the code sample itself. 
Using BigQuery DataFrames, you can deploy custom Python functions that process and transform this output.\n", + "\n" + ], + "metadata": { + "id": "Fx4lsNqMorJ-" + } + }, + { + "cell_type": "markdown", + "source": [ + "Running the cell below creates a custom function that you can use to process the LLM output data in two ways:\n", + "1. Strip the LLM text output to include only the code block.\n", + "2. Substitute `import pandas as pd` with `import bigframes.pandas as bf` so that the resulting code block works with BigQuery DataFrames." + ], + "metadata": { + "id": "d8L7SN03VByG" + } + }, + { + "cell_type": "code", + "source": [ + "@bf.remote_function([str], str, bigquery_connection=CONN_NAME)\n", + "def extract_code(text: str):\n", + " try:\n", + " res = text[text.find('\\n')+1:text.find('```', 3)]\n", + " res = res.replace(\"import pandas as pd\", \"import bigframes.pandas as bf\")\n", + " if \"import bigframes.pandas as bf\" not in res:\n", + " res = \"import bigframes.pandas as bf\\n\" + res\n", + " return res\n", + " except:\n", + " return \"\"" + ], + "metadata": { + "id": "GskyyUQPowBT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "The custom function is deployed as a Cloud Function, and then integrated with BigQuery as a [remote function](https://cloud.google.com/bigquery/docs/remote-functions). Save both of the function names so that you can clean them up at the end of this notebook." 
+ ], + "metadata": { + "id": "hVQAoqBUOJQf" + } + }, + { + "cell_type": "code", + "source": [ + "CLOUD_FUNCTION_NAME = format(extract_code.bigframes_cloud_function)\n", + "print(\"Cloud Function Name \" + CLOUD_FUNCTION_NAME)\n", + "REMOTE_FUNCTION_NAME = format(extract_code.bigframes_remote_function)\n", + "print(\"Remote Function Name \" + REMOTE_FUNCTION_NAME)" + ], + "metadata": { + "id": "PBlp-C-DOHRO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Apply the custom function to each LLM output DataFrame to get the processed results:" + ], + "metadata": { + "id": "4FEucaiqVs3H" + } + }, + { + "cell_type": "code", + "source": [ + "df_code = df_pred.assign(code=df_pred['ml_generate_text_llm_result'].apply(extract_code))\n", + "series_code = series_pred.assign(code=series_pred['ml_generate_text_llm_result'].apply(extract_code))" + ], + "metadata": { + "id": "bsQ9cmoWo0Ps" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can see the differences by inspecting the first row of data:" + ], + "metadata": { + "id": "ujQVVuhfWA3y" + } + }, + { + "cell_type": "code", + "source": [ + "print(df_code['code'].iloc[0])" + ], + "metadata": { + "id": "7yWzjhGy_zcy" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Save the results to Cloud Storage\n", + "\n", + "BigQuery DataFrames lets you save a BigQuery DataFrames DataFrame as a CSV file in Cloud Storage for further use. Try that now with your processed LLM output data." 
+ ], + "metadata": { + "id": "GTRdUw-Ro5R1" + } + }, + { + "cell_type": "markdown", + "source": [ + "Create a new Cloud Storage bucket with a unique name:" + ], + "metadata": { + "id": "9DQ7eiQxPTi3" + } + }, + { + "cell_type": "code", + "source": [ + "import uuid\n", + "BUCKET_ID = \"code-samples-\" + str(uuid.uuid1())\n", + "\n", + "!gsutil mb gs://{BUCKET_ID}" + ], + "metadata": { + "id": "-J5LHgS6LLZ0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Use `to_csv` to write each BigQuery DataFrames DataFrame as a CSV file in the Cloud Storage bucket:" + ], + "metadata": { + "id": "tyxZXj0UPYUv" + } + }, + { + "cell_type": "code", + "source": [ + "df_code[[\"code\"]].to_csv(f\"gs://{BUCKET_ID}/df_code*.csv\")\n", + "series_code[[\"code\"]].to_csv(f\"gs://{BUCKET_ID}/series_code*.csv\")" + ], + "metadata": { + "id": "Zs_b5L-4IvER" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "You can navigate to the Cloud Storage bucket browser to download the two files and view them.\n", + "\n", + "Run the following cell, and then follow the link to your Cloud Storage bucket browser:" + ], + "metadata": { + "id": "UDBtDlrTuuh8" + } + }, + { + "cell_type": "code", + "source": [ + "print(f'https://console.developers.google.com/storage/browser/{BUCKET_ID}/')" + ], + "metadata": { + "id": "PspCXu-qu_ND" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# Summary and next steps\n", + "\n", + "You've used BigQuery DataFrames' integration with LLM models (`bigframes.ml.llm`) to generate code samples, and have transformed LLM output by creating and using a custom function in BigQuery DataFrames.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub 
repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." + ], + "metadata": { + "id": "RGSvUk48RK20" + } + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "source": [ + "# # Delete the BigQuery Connection\n", + "# from google.cloud import bigquery_connection_v1 as bq_connection\n", + "# client = bq_connection.ConnectionServiceClient()\n", + "# CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/{CONN_NAME}\"\n", + "# client.delete_connection(name=CONNECTION_ID)\n", + "# print(f\"Deleted connection '{CONNECTION_ID}'.\")" + ], + "metadata": { + "id": "yw7A461XLjvW" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# # Delete the Cloud Function\n", + "# ! gcloud functions delete {CLOUD_FUNCTION_NAME} --quiet\n", + "# # Delete the Remote Function\n", + "# REMOTE_FUNCTION_NAME = REMOTE_FUNCTION_NAME.replace(PROJECT_ID + \".\", \"\")\n", + "# ! bq rm --routine --force=true {REMOTE_FUNCTION_NAME}" + ] + }, + { + "cell_type": "code", + "source": [ + "# # Delete the Google Cloud Storage bucket and files\n", + "# ! 
gsutil rm -r gs://{BUCKET_ID}\n", + "# print(f\"Deleted bucket '{BUCKET_ID}'.\")" + ], + "metadata": { + "id": "iQFo6OUBLmi3" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "colab": { + "toc_visible": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb b/notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb new file mode 100644 index 0000000000..9a74beaad8 --- /dev/null +++ b/notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb @@ -0,0 +1,743 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "## Train a linear regression model with BigQuery DataFrames ML\n", + "\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Use this notebook to learn how to train a linear regression model by using BigQuery DataFrames ML. BigQuery DataFrames ML provides a scikit-learn-like API for ML powered by the BigQuery engine.\n", + "\n", + "This example is adapted from the [BQML linear regression tutorial](https://cloud.google.com/bigquery-ml/docs/linear-regression-tutorial).\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/python/docs/reference/bigframes/latest)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you use BigQuery DataFrames to create a linear regression model that predicts the weight of an Adelie penguin based on the penguin's island of residence, culmen length and depth, flipper length, and sex.\n", + "\n", + "The steps include:\n", + "\n", + "- Creating a DataFrame from a BigQuery table.\n", + "- Cleaning and preparing data using pandas.\n", + "- Creating a linear regression model using `bigframes.ml`.\n", + "- Saving the ML model to BigQuery for future use." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the [```penguins``` table](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=ml_datasets&t=penguins) (a BigQuery Public Dataset) which includes data on a set of penguins including species, island of residence, weight, culmen length and depth, flipper length, and sex." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (compute)\n", + "* BigQuery ML\n", + "\n", + "Learn about [BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models)\n", + "and [BigQuery ML pricing](https://cloud.google.com/bigquery/pricing#bqml),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages, which are required to run this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9O0Ka4W2MNF3" + }, + "outputs": [], + "source": [ + "!pip install bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only\n", + "\n", + "Uncomment and run the following cell to restart the kernel:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oDfTjfACBvJk" + }, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. 
[Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eF-Twtc4XGem" + }, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NPPMuw2PXGeo" + }, + "outputs": [], + "source": [ + "bf.options.bigquery.project = PROJECT_ID\n", + "bf.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D21CoOlfFTYI" + }, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.reset_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EMAqR37AfLS" + }, + "source": [ + "## Read a BigQuery table into a BigQuery DataFrames DataFrame\n", + "\n", + "Read the [```penguins``` table](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=ml_datasets&t=penguins) into a BigQuery DataFrames DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EDAaIwHpQCDZ" + }, + "outputs": [], + "source": [ + "df = bf.read_gbq(\"bigquery-public-data.ml_datasets.penguins\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DJu837YEXD7B" + }, + "source": [ + "Take a look at the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_gPD0Zn1Stdb" + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rwPLjqW2Ajzh" + }, + "source": [ + "## Clean and prepare data\n", + "\n", + "You can use pandas as you normally would on the BigQuery DataFrames DataFrame, but calculations happen in the BigQuery query engine instead of your local environment.\n", + "\n", + "Because this model will focus on the Adelie Penguin species, you need to filter the data for only those rows representing Adelie penguins. Then you drop the `species` column because it is no longer needed.\n", + "\n", + "As these functions are applied, only the new DataFrame object `adelie_data` is modified. The source table and the original DataFrame object `df` don't change." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6i6HkFJZa8na" + }, + "outputs": [], + "source": [ + "# Filter the data down to the Adelie Penguin species\n", + "adelie_data = df[df.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", + "\n", + "# Drop the species column\n", + "adelie_data = adelie_data.drop(columns=[\"species\"])\n", + "\n", + "# Take a look at the filtered DataFrame\n", + "adelie_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jhK2OlyMbY4L" + }, + "source": [ + "Drop rows with `NULL` values in order to create a BigQuery DataFrames DataFrame for the training data:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0am3hdlXZfxZ" + }, + "outputs": [], + "source": [ + "# Drop rows with nulls to get training data\n", + "training_data = adelie_data.dropna()\n", + "\n", + "# Take a peek at the training data\n", + "training_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M_-0X7NxYK5f" + }, + "source": [ + "Specify your feature (or input) columns and the label (or output) column:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YKwCW7Nsavap" + }, + "outputs": [], + "source": [ + "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", + "label_columns = training_data[['body_mass_g']]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CjyM7vZJZ0sQ" + }, + "source": [ + "There is a row within the `adelie_data` BigQuery DataFrames DataFrame that has a `NULL` value for the `body mass` column. 
`body mass` is the label column, which is the value that the model you are creating is trying to predict.\n", + "\n", + "Create a new BigQuery DataFrames DataFrame, `test_data`, for this row so that you can use it as test data on which to make a prediction later:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wej78IDUaRW9" + }, + "outputs": [], + "source": [ + "test_data = adelie_data[adelie_data.body_mass_g.isnull()]\n", + "\n", + "test_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fx4lsNqMorJ-" + }, + "source": [ + "## Create the linear regression model\n", + "\n", + "BigQuery DataFrames ML lets you move from exploring data to creating machine learning models through its scikit-learn-like API, `bigframes.ml`. BigQuery DataFrames ML supports several types of [ML models](https://cloud.google.com/python/docs/reference/bigframes/latest#ml-capabilities).\n", + "\n", + "In this notebook, you create a linear regression model, a type of regression model that generates a continuous value from a linear combination of input features.\n", + "\n", + "When you create a model with BigQuery DataFrames ML, it is saved locally and limited to the BigQuery session. However, as you'll see in the next section, you can use `to_gbq` to save the model permanently to your BigQuery project." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EloGtMnverFF" + }, + "source": [ + "### Create the model using `bigframes.ml`\n", + "\n", + "When you pass the feature columns without transforms, BigQuery ML uses\n", + "[automatic preprocessing](https://cloud.google.com/bigquery/docs/auto-preprocessing) to encode string values and scale numeric values.\n", + "\n", + "BigQuery ML also [automatically splits the data for training and evaluation](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-create-glm#data_split_method), although for datasets with less than 500 rows (such as this one), all rows are used for training." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GskyyUQPowBT" + }, + "outputs": [], + "source": [ + "from bigframes.ml.linear_model import LinearRegression\n", + "\n", + "model = LinearRegression()\n", + "\n", + "model.fit(feature_columns, label_columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UGjeMPC2caKK" + }, + "source": [ + "### Score the model\n", + "\n", + "Check how the model performed by using the `score` method. More information on model scoring can be found [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#mlevaluate_output)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kGBJKafpo0dl" + }, + "outputs": [], + "source": [ + "model.score(feature_columns, label_columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P2lUiZZ_cjri" + }, + "source": [ + "### Predict using the model\n", + "\n", + "Use the model to predict the body mass of the data row you saved earlier to the `test_data` DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bsQ9cmoWo0Ps" + }, + "outputs": [], + "source": [ + "model.predict(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GTRdUw-Ro5R1" + }, + "source": [ + "## Save the model in BigQuery\n", + "\n", + "The model is saved locally within this session. You can save the model permanently to BigQuery for use in future sessions, and to make the model sharable with others." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "K0mPaoGpcwwy" + }, + "source": [ + "Create a BigQuery dataset to house the model, adding a name for your dataset as the `DATASET_ID` variable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSP7gt13QrQt" + }, + "outputs": [], + "source": [ + "DATASET_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "from google.cloud import bigquery\n", + "client = bigquery.Client(project=PROJECT_ID)\n", + "dataset = bigquery.Dataset(PROJECT_ID + \".\" + DATASET_ID)\n", + "dataset.location = REGION\n", + "dataset = client.create_dataset(dataset, exists_ok=True)\n", + "print(f\"Dataset {dataset.dataset_id} created.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zqAIWWgJczp-" + }, + "source": [ + "Save the model using the `to_gbq` method:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QE_GD4Byo_jb" + }, + "outputs": [], + "source": [ + "model.to_gbq(DATASET_ID + \".penguin_weight\" , replace=True)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": { + "id": "f7uHacAy49rT" + }, + "source": [ + "You can view the saved model in the BigQuery console under the dataset you created in the first step. Run the following cell and follow the link to view your BigQuery console:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qDBoiA_0488Z" + }, + "outputs": [], + "source": [ + "print(f'https://console.developers.google.com/bigquery?p={PROJECT_ID}')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G_wjSfXpWTuy" + }, + "source": [ + "# Summary and next steps\n", + "\n", + "You've created a linear regression model using `bigframes.ml`.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "## Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# # Delete the BigQuery dataset and associated ML model\n", + "# from google.cloud import bigquery\n", + "# client = bigquery.Client(project=PROJECT_ID)\n", + "# client.delete_dataset(\n", + "# DATASET_ID, delete_contents=True, not_found_ok=True\n", + "# )\n", + "# print(\"Deleted dataset '{}'.\".format(DATASET_ID))" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + 
"toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/getting_started/getting_started_bq_dataframes.ipynb b/notebooks/getting_started/getting_started_bq_dataframes.ipynb new file mode 100644 index 0000000000..7815182e54 --- /dev/null +++ b/notebooks/getting_started/getting_started_bq_dataframes.ipynb @@ -0,0 +1,971 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2023 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Get started with BigQuery DataFrames\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "24743cf4a1e1" + }, + "source": [ + "**_NOTE_**: This notebook has been tested in the following environment:\n", + "\n", + "* Python version = 3.10" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "Use this notebook to get started with BigQuery DataFrames, including setup, installation, and basic tutorials.\n", + "\n", + "BigQuery DataFrames provides a Pythonic DataFrame and machine learning (ML) API powered by the BigQuery engine.\n", + "\n", + "* `bigframes.pandas` provides a pandas-like API for analytics.\n", + "* `bigframes.ml` provides a scikit-learn-like API for ML.\n", + "\n", + "Learn more about [BigQuery DataFrames](https://cloud.google.com/python/docs/reference/bigframes/latest)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d975e698c9a4" + }, + "source": [ + "### Objective\n", + "\n", + "In this tutorial, you learn how to install BigQuery DataFrames, load data into a BigQuery DataFrames DataFrame, and inspect and manipulate the data using pandas and a custom Python function, running at BigQuery scale.\n", + "\n", + "The steps include:\n", + "\n", + "- Creating a BigQuery DataFrames DataFrame: Access data from a local CSV to create a BigQuery DataFrames DataFrame.\n", + "- Inspecting and manipulating data: Use pandas to perform data cleaning and preparation on the DataFrame.\n", + "- Deploying a custom function: Deploy a [remote function](https://cloud.google.com/bigquery/docs/remote-functions) that runs a scalar Python function at BigQuery scale." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "08d289fa873f" + }, + "source": [ + "### Dataset\n", + "\n", + "This tutorial uses the [```penguins``` table](https://console.cloud.google.com/bigquery?p=bigquery-public-data&d=ml_datasets&t=penguins) (a BigQuery public dataset), which contains data on a set of penguins including species, island of residence, weight, culmen length and depth, flipper length, and sex.\n", + "\n", + "The same dataset is also stored in a public Cloud Storage bucket as a CSV file so that you can use it to try ingesting data from a local environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aed92deeb4a0" + }, + "source": [ + "### Costs\n", + "\n", + "This tutorial uses billable components of Google Cloud:\n", + "\n", + "* BigQuery (storage and compute)\n", + "* Cloud Functions\n", + "\n", + "Learn about [BigQuery storage pricing](https://cloud.google.com/bigquery/pricing#storage),\n", + "[BigQuery compute pricing](https://cloud.google.com/bigquery/pricing#analysis_pricing_models),\n", + "and [Cloud Functions pricing](https://cloud.google.com/functions/pricing),\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "to generate a cost estimate based on your projected usage." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i7EUnXsZhAGF" + }, + "source": [ + "## Installation\n", + "\n", + "Install the following packages, which are required to run this notebook:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mfPoOwPLGpSr" + }, + "outputs": [], + "source": [ + "!pip install bigframes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "58707a750154" + }, + "source": [ + "### Colab only\n", + "\n", + "Uncomment and run the following cell to restart the kernel:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f200f10a1da3" + }, + "outputs": [], + "source": [ + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BF1j6f9HApxa" + }, + "source": [ + "## Before you begin\n", + "\n", + "Complete the tasks in this section to set up your environment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yq7zKYWelRQP" + }, + "source": [ + "### Set up your Google Cloud project\n", + "\n", + "**The following steps are required, regardless of your notebook environment.**\n", + "\n", + "1. [Select or create a Google Cloud project](https://console.cloud.google.com/cloud-resource-manager). When you first create an account, you get a $300 credit towards your compute/storage costs.\n", + "\n", + "2. [Make sure that billing is enabled for your project](https://cloud.google.com/billing/docs/how-to/modify-project).\n", + "\n", + "3. 
[Click here](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com,bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,artifactregistry.googleapis.com,cloudbuild.googleapis.com,cloudresourcemanager.googleapis.com) to enable the following APIs:\n", + "\n", + " * BigQuery API\n", + " * BigQuery Connection API\n", + " * Cloud Functions API\n", + " * Cloud Run API\n", + " * Artifact Registry API\n", + " * Cloud Build API\n", + " * Cloud Resource Manager API\n", + "\n", + "4. If you are running this notebook locally, install the [Cloud SDK](https://cloud.google.com/sdk)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WReHDGG5g0XY" + }, + "source": [ + "#### Set your project ID\n", + "\n", + "If you don't know your project ID, try the following:\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oM1iC_MfAts1" + }, + "outputs": [], + "source": [ + "PROJECT_ID = \"\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "! gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "region" + }, + "source": [ + "#### Set the region\n", + "\n", + "You can also change the `REGION` variable used by BigQuery. Learn more about [BigQuery regions](https://cloud.google.com/bigquery/docs/locations#supported_locations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eF-Twtc4XGem" + }, + "outputs": [], + "source": [ + "REGION = \"US\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sBCra4QMA2wR" + }, + "source": [ + "### Authenticate your Google Cloud account\n", + "\n", + "Depending on your Jupyter environment, you might have to manually authenticate. Follow the relevant instructions below." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "74ccc9e52986" + }, + "source": [ + "**Vertex AI Workbench**\n", + "\n", + "Do nothing, you are already authenticated." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "de775a3773ba" + }, + "source": [ + "**Local JupyterLab instance**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "254614fa0c46" + }, + "outputs": [], + "source": [ + "# ! gcloud auth login" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ef21552ccea8" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "603adbbf0532" + }, + "outputs": [], + "source": [ + "# from google.colab import auth\n", + "# auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "960505627ddf" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PyQmSRbKA8r-" + }, + "outputs": [], + "source": [ + "import bigframes.pandas as bf" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "init_aip:mbsdk,all" + }, + "source": [ + "\n", + "### Set BigQuery DataFrames options" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NPPMuw2PXGeo" + }, + "outputs": [], + "source": [ + "bf.options.bigquery.project = 
PROJECT_ID\n", + "bf.options.bigquery.location = REGION" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pDfrKwMKE_dK" + }, + "source": [ + "If you want to reset the location of the created DataFrame or Series objects, reset the session by executing `bf.reset_session()`. After that, you can reuse `bf.options.bigquery.location` to specify another location." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-19Uiwoo9pP4" + }, + "source": [ + "## See the power of BigQuery DataFrames first-hand\n", + "\n", + "BigQuery DataFrames enables you to interact with datasets of any size, so that you can explore, transform, and understand even your biggest datasets using familiar tools like pandas and scikit-learn." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KMX4D2uoBwM0" + }, + "source": [ + "For example, take the BigQuery sample table `bigquery-samples.wikipedia_pageviews.200809h`, which is ~60 GB in size. This is not a dataset you'd likely be able to process in pandas without extra infrastructure.\n", + "\n", + "With BigQuery DataFrames, however, computation is handled by BigQuery's highly scalable compute engine, meaning you can focus on doing data science without hitting size limitations." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i98c46p1CXoV" + }, + "source": [ + "If you'd like to try creating a BigQuery DataFrames DataFrame from this table, uncomment and run the next cell to load the table using the `read_gbq` method.\n", + "\n", + "> Note: Keep in mind that running these operations will count against your monthly [free tier allowance in BigQuery](https://cloud.google.com/bigquery/pricing#free-tier)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vyex9BQI-BNa" + }, + "outputs": [], + "source": [ + "# bq_df_sample = bf.read_gbq(\"bigquery-samples.wikipedia_pageviews.200809h\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gE6CEALjDZZV" + }, + "source": [ + "No problem! BigQuery DataFrames makes a DataFrame, `bq_df_sample`, containing the entirety of the source table of data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "T6lAIeelDwLz" + }, + "source": [ + "Uncomment and run the following cell to see pandas in action over your new BigQuery DataFrames DataFrame.\n", + "\n", + "This code uses regex to filter the DataFrame to include only rows with Wikipedia page titles containing the word \"Google\", sums the total views by page title, and then returns the top 100 results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XfGq5apK-D_e" + }, + "outputs": [], + "source": [ + "# bq_df_sample[bq_df_sample.title.str.contains(r\"[Gg]oogle\")]\\\n", + "# .groupby(['title'], as_index=False)['views'].sum(numeric_only=True)\\\n", + "# .sort_values('views', ascending=False)\\\n", + "# .head(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i6XV-HTN-IFF" + }, + "source": [ + "In addition to giving you access to pandas, BigQuery DataFrames also enables you to build ML models, run inference, and deploy and run your own Python functions at scale. You'll see examples throughout this and other notebooks in this GitHub repo.\n", + "\n", + "Now you'll move to the smaller `penguins` dataset for the remainder of this getting started guide." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9EMAqR37AfLS" + }, + "source": [ + "## Create a BigQuery DataFrames DataFrame\n", + "\n", + "You can create a BigQuery DataFrames DataFrame by reading data from any of the following locations:\n", + "\n", + "* A local data file\n", + "* Data stored in a BigQuery table\n", + "* A data file stored in Cloud Storage\n", + "* An in-memory pandas DataFrame\n", + "\n", + "The following sections show how to use the first two options." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iZDjzglh9eWZ" + }, + "source": [ + "### Create a DataFrame from a local file\n", + "\n", + "Use the instructions in the following sections to create a BigQuery DataFrames DataFrame from a local file.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Jry3NoFv3Wm" + }, + "source": [ + "#### Get the CSV file\n", + "\n", + "First, copy and paste the following link into a new browser window to download the CSV file of the penguin data to your local machine:\n", + "\n", + "> http://storage.googleapis.com/cloud-samples-data/vertex-ai/bigframe/penguins.csv\n", + "\n", + "Next, upload the local CSV file to your notebook environment, using the relevant instructions for your environment:\n", + "\n", + "**Vertex AI Workbench or a local JupyterLab instance**\n", + "\n", + "1. Follow these [directions](https://jupyterlab.readthedocs.io/en/latest/user/files.html#uploading-and-downloading) to upload the file from your machine to your notebook environment by using the UI.\n", + "2. Uncomment the next cell, set the variable `fn` to match the path to your file, and then run the cell." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SvyXzkRl783u" + }, + "outputs": [], + "source": [ + "# fn = 'penguins.csv'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yqcuF1JNvFse" + }, + "source": [ + "**Colab**\n", + "\n", + "Uncomment and run the following cell:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3QHQYlnoBLpt" + }, + "outputs": [], + "source": [ + "# from google.colab import files\n", + "# uploaded = files.upload()\n", + "# for fn in uploaded.keys():\n", + "# print('User uploaded file \"{name}\" with length {length} bytes'.format(\n", + "# name=fn, length=len(uploaded[fn])))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sJsrwAQY_H6g" + }, + "source": [ + "#### Create a DataFrame\n", + "\n", + "Create a BigQuery DataFrames DataFrame from the uploaded CSV file:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EDAaIwHpQCDZ" + }, + "outputs": [], + "source": [ + "df_from_local = bf.read_csv(fn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U-RVfNCu_h_h" + }, + "source": [ + "Take a look at the first few rows of the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_gPD0Zn1Stdb" + }, + "outputs": [], + "source": [ + "df_from_local.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rK0lNJmz_xkA" + }, + "source": [ + "### Ingest data from a DataFrame to a BigQuery table\n", + "\n", + "BigQuery DataFrames lets you create a BigQuery table from a BigQuery DataFrames DataFrame on-the-fly." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V1DWpmSCAEql" + }, + "source": [ + "First, create a BigQuery dataset to house the table. Choose a name for your dataset, or keep the suggestion of `birds`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZSP7gt13QrQt" + }, + "outputs": [], + "source": [ + "DATASET_ID = \"birds\" # @param {type:\"string\"}\n", + "\n", + "from google.cloud import bigquery\n", + "client = bigquery.Client(project=PROJECT_ID)\n", + "dataset = bigquery.Dataset(PROJECT_ID + \".\" + DATASET_ID)\n", + "dataset.location = REGION\n", + "dataset = client.create_dataset(dataset, exists_ok=True)\n", + "print(f\"Dataset {dataset.dataset_id} created.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Jd0dFISwAPPa" + }, + "source": [ + "Next, use the `to_gbq` method to create a BigQuery table from the DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oP1NIAmUBjop" + }, + "outputs": [], + "source": [ + "df_from_local.to_gbq(PROJECT_ID + \".\" + DATASET_ID + \".penguins\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kfF6fnmmAZEK" + }, + "source": [ + "### Create a DataFrame from BigQuery data\n", + "You can create a BigQuery DataFrames DataFrame from a BigQuery table by using the `read_gbq` method and referencing either an entire table or a SQL query." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TEy5jHJDD6hx" + }, + "source": [ + "Create a BigQuery DataFrames DataFrame from the BigQuery table you created in the previous section, and view a few rows:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IBuo-d6dWfsA" + }, + "outputs": [], + "source": [ + "query_or_table = f\"\"\"{PROJECT_ID}.{DATASET_ID}.penguins\"\"\"\n", + "bq_df = bf.read_gbq(query_or_table)\n", + "bq_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rwPLjqW2Ajzh" + }, + "source": [ + "## Inspect and manipulate data in BigQuery DataFrames" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bExmYlL_ELtV" + }, + "source": [ + "### Using pandas\n", + "\n", + "You can use pandas as you normally would on the BigQuery DataFrames DataFrame, but calculations happen in the BigQuery query engine instead of your local environment. There are 150+ pandas functions supported in BigQuery DataFrames. You can view the list in [the documentation](https://cloud.google.com/python/docs/reference/bigframes/latest)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZHFUc3Q_FHc1" + }, + "source": [ + "To see this in action, inspect one of the columns (or series) of the BigQuery DataFrames DataFrame:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6i6HkFJZa8na" + }, + "outputs": [], + "source": [ + "bq_df[\"body_mass_g\"].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "EJIZJaNXFQzh" + }, + "source": [ + "Compute the mean of this series:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "YKwCW7Nsavap" + }, + "outputs": [], + "source": [ + "average_body_mass = bq_df[\"body_mass_g\"].mean()\n", + "print(f\"average_body_mass: {average_body_mass}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DSs1cnca-MOU" + }, + "source": [ + "Calculate the mean `body_mass_g` by `species` using the `groupby` operation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4PyKMR61-Mjy" + }, + "outputs": [], + "source": [ + "bq_df[\"species\", \"body_mass_g\"].groupby(by=bq_df[\"species\"]).mean(numeric_only=True).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6sf9kZ2C9Ixe" + }, + "source": [ + "You can confirm that the calculations were run in BigQuery by clicking \"Open job\" from the previous cells' output. This takes you to the BigQuery console to view the SQL statement and job details." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cWVNZ8D_FUtT" + }, + "source": [ + "### Using custom functions\n", + "\n", + "Running your own Python functions (or being able to bring your packages) and using them at scale is a challenge many data scientists face. BigQuery DataFrames makes it easy to deploy [remote functions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) that run scalar Python functions at BigQuery scale. 
These functions are persisted as [BigQuery remote functions](https://cloud.google.com/bigquery/docs/remote-functions) that you can then re-use." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zjw8toUbHuRD" + }, + "source": [ + "Running the cell below creates a custom function using the `remote_function` method. This function categorizes a value into one of two buckets: >= 4000 or <4000.\n", + "\n", + "> Note: Creating a function requires a [BigQuery connection](https://cloud.google.com/bigquery/docs/remote-functions#create_a_remote_function). This code assumes a pre-created connection named `bigframes-rf-conn`. If\n", + "the connection is not already created, BigQuery DataFrames attempts to create one assuming the [necessary APIs\n", + "and IAM permissions](https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_remote_function) are set up in the project.\n", + "\n", + "This cell takes a few minutes to run because it creates the BigQuery connection (if applicable) and deploys the Cloud Function." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rSWTOG-vb2Fc" + }, + "outputs": [], + "source": [ + "@bf.remote_function([float], str, bigquery_connection='bigframes-rf-conn')\n", + "def get_bucket(num):\n", + " if not num: return \"NA\"\n", + " boundary = 4000\n", + " return \"at_or_above_4000\" if num >= boundary else \"below_4000\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N7JH0BI5IOpK" + }, + "source": [ + "The custom function is deployed as a Cloud Function, and is then integrated with BigQuery as a remote function.\n", + "\n", + "Save both of the function names so that you can clean them up at the end of this notebook." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6ejPXoyEQpWE" + }, + "outputs": [], + "source": [ + "CLOUD_FUNCTION_NAME = format(get_bucket.bigframes_cloud_function)\n", + "print(\"Cloud Function Name \" + CLOUD_FUNCTION_NAME)\n", + "REMOTE_FUNCTION_NAME = format(get_bucket.bigframes_remote_function)\n", + "print(\"Remote Function Name \" + REMOTE_FUNCTION_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vHV3JqKjJHsH" + }, + "source": [ + "Apply the custom function to the BigQuery DataFrames DataFrame to bucketize the `body_mass_g` value of the penguins:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NxSd9WZFcIji" + }, + "outputs": [], + "source": [ + "bq_df = bq_df.assign(body_mass_bucket=bq_df['body_mass_g'].apply(get_bucket))\n", + "bq_df[['body_mass_g', 'body_mass_bucket']].head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wCsmt0IwFkDy" + }, + "source": [ + "## Summary and next steps\n", + "\n", + "You've created BigQuery DataFrames DataFrames, and inspected and manipulated data with pandas and custom remote functions at BigQuery scale and speed.\n", + "\n", + "Learn more about BigQuery DataFrames in the [documentation](https://cloud.google.com/python/docs/reference/bigframes/latest) and find more sample notebooks in the [GitHub repo](https://github.com/googleapis/python-bigquery-dataframes/tree/main/notebooks), including an introductory notebook for `bigframes.ml`." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TpV-iwP9qw9c" + }, + "source": [ + "### Cleaning up\n", + "\n", + "To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud\n", + "project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.\n", + "\n", + "Otherwise, you can uncomment the remaining cells and run them to delete the individual resources you created in this tutorial:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "sx_vKniMq9ZX" + }, + "outputs": [], + "source": [ + "# # Delete the BigQuery dataset\n", + "# from google.cloud import bigquery\n", + "# client = bigquery.Client(project=PROJECT_ID)\n", + "# client.delete_dataset(\n", + "# DATASET_ID, delete_contents=True, not_found_ok=True\n", + "# )\n", + "# print(\"Deleted dataset '{}'.\".format(DATASET_ID))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_dTCXvCxtPw9" + }, + "outputs": [], + "source": [ + "# # Delete the BigQuery Connection\n", + "# from google.cloud import bigquery_connection_v1 as bq_connection\n", + "# client = bq_connection.ConnectionServiceClient()\n", + "# CONNECTION_ID = f\"projects/{PROJECT_ID}/locations/{REGION}/connections/bigframes-rf-conn\"\n", + "# client.delete_connection(name=CONNECTION_ID)\n", + "# print(\"Deleted connection '{}'.\".format(CONNECTION_ID))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EDAIIfcpwNOF" + }, + "outputs": [], + "source": [ + "# # Delete the Cloud Function\n", + "# ! gcloud functions delete {CLOUD_FUNCTION_NAME} --quiet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QwumLUKmVpuH" + }, + "outputs": [], + "source": [ + "# # Delete the Remote Function\n", + "# REMOTE_FUNCTION_NAME = REMOTE_FUNCTION_NAME.replace(PROJECT_ID + \".\", \"\")\n", + "# ! 
bq rm --routine --force=true {REMOTE_FUNCTION_NAME}" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/03 - Using ML - ML fundamentals.ipynb b/notebooks/getting_started/ml_fundamentals.ipynb similarity index 100% rename from notebooks/03 - Using ML - ML fundamentals.ipynb rename to notebooks/getting_started/ml_fundamentals.ipynb diff --git a/notebooks/10 - Regionalized.ipynb b/notebooks/location/regionalized.ipynb similarity index 100% rename from notebooks/10 - Regionalized.ipynb rename to notebooks/location/regionalized.ipynb diff --git a/notebooks/05 - Using ML - Easy linear regression.ipynb b/notebooks/regression/easy_linear_regression.ipynb similarity index 100% rename from notebooks/05 - Using ML - Easy linear regression.ipynb rename to notebooks/regression/easy_linear_regression.ipynb diff --git a/notebooks/04 - Using ML - SKLearn linear regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb similarity index 100% rename from notebooks/04 - Using ML - SKLearn linear regression.ipynb rename to notebooks/regression/sklearn_linear_regression.ipynb diff --git a/notebooks/50 - Remote Function.ipynb b/notebooks/remote_functions/remote_function.ipynb similarity index 100% rename from notebooks/50 - Remote Function.ipynb rename to notebooks/remote_functions/remote_function.ipynb diff --git a/noxfile.py b/noxfile.py index 7d4cb1c61b..ec5a1b1651 100644 --- a/noxfile.py +++ b/noxfile.py @@ -19,6 +19,7 @@ from multiprocessing import Process import os import pathlib +from pathlib import Path import re import shutil from typing import Dict, List @@ -30,6 +31,10 @@ BLACK_VERSION = "black==22.3.0" ISORT_VERSION = "isort==5.12.0" SPHINX_VERSION = "sphinx==4.5.0" + +# pytest-retry 1.4.0 fails on Python 3.11. 
+# https://github.com/str0zzapreti/pytest-retry/issues/17 +PYTEST_RETRY_VERSION = "pytest-retry<=1.3.0" LINT_PATHS = ["docs", "bigframes", "tests", "noxfile.py", "setup.py"] DEFAULT_PYTHON_VERSION = "3.10" @@ -55,7 +60,7 @@ "openpyxl", "pytest", "pytest-cov", - "pytest-retry", + PYTEST_RETRY_VERSION, "pytest-timeout", "pytest-xdist", "google-cloud-testutils", @@ -590,23 +595,37 @@ def system_prerelease(session: nox.sessions.Session): @nox.session(python=SYSTEM_TEST_PYTHON_VERSIONS) def notebook(session): session.install("-e", ".[all]") - session.install("pytest", "pytest-xdist", "pytest-retry", "nbmake") - - notebooks = [ - "00 - Summary.ipynb", - "01 - Getting Started.ipynb", - "02 - DataFrame.ipynb", - "03 - Using ML - ML fundamentals.ipynb", - "04 - Using ML - SKLearn linear regression.ipynb", - "05 - Using ML - Easy linear regression.ipynb", - "06 - Using ML - Large Language Models.ipynb", - "50 - Remote Function.ipynb", + session.install("pytest", "pytest-xdist", PYTEST_RETRY_VERSION, "nbmake") + + notebooks_list = list(Path("notebooks/").glob("*/*.ipynb")) + + denylist = [ + # Regionalized testing is manually added later. + "notebooks/location/regionalized.ipynb", + # These notebooks contain special colab `param {type:"string"}` + # comments, which make it easy for customers to fill in their + # own information. + # TODO(ashleyxu): Test these notebooks by replacing parameters with + # appropriate values and omitting cleanup logic that may break + # our test infrastructure. + "notebooks/getting_started/getting_started_bq_dataframes.ipynb", + "notebooks/getting_started/bq_dataframes_llm_code_generation.ipynb", + "notebooks/getting_started/bq_dataframes_ml_linear_regression.ipynb", + "notebooks/generative_ai/bq_dataframes_ml_drug_name_generation.ipynb", + # The experimental notebooks imagine features that don't yet + # exist or only exist as temporary prototypes. 
+ "notebooks/experimental/longer_ml_demo.ipynb", ] - notebooks = [os.path.join("notebooks", nb) for nb in notebooks] + + # Convert each Path notebook object to a string using a list comprehension. + notebooks = [str(nb) for nb in notebooks_list] + + # Remove tests that we choose not to test. + notebooks = list(filter(lambda nb: nb not in denylist, notebooks)) # Regionalized notebooks notebooks_reg = { - "10 - Regionalized.ipynb": [ + "regionalized.ipynb": [ "asia-southeast1", "eu", "europe-west4", @@ -616,7 +635,8 @@ def notebook(session): ] } notebooks_reg = { - os.path.join("notebooks", nb): regions for nb, regions in notebooks_reg.items() + os.path.join("notebooks/location", nb): regions + for nb, regions in notebooks_reg.items() } # For some reason nbmake exits silently with "no tests ran" message if diff --git a/pytest.ini b/pytest.ini index 693439f47c..204c743bbf 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,4 @@ [pytest] +doctest_optionflags = NORMALIZE_WHITESPACE filterwarnings = ignore::pandas.errors.SettingWithCopyWarning diff --git a/samples/snippets/remote_function.py b/samples/snippets/remote_function.py new file mode 100644 index 0000000000..37972672c3 --- /dev/null +++ b/samples/snippets/remote_function.py @@ -0,0 +1,147 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def run_remote_function_and_read_gbq_function(project_id: str): + your_gcp_project_id = project_id + + # [START bigquery_dataframes_remote_function] + import bigframes.pandas as bpd + + # Set BigQuery DataFrames options + bpd.options.bigquery.project = your_gcp_project_id + bpd.options.bigquery.location = "us" + + # BigQuery DataFrames gives you the ability to turn your custom scalar + # functions into a BigQuery remote function. It requires the GCP project to + # be set up appropriately and the user having sufficient privileges to use + # them. One can find more details about the usage and the requirements via + # `help` command. + help(bpd.remote_function) + + # Read a table and inspect the column of interest. + df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins") + df["body_mass_g"].head(10) + + # Define a custom function, and specify the intent to turn it into a remote + # function. It requires a BigQuery connection. If the connection is not + # already created, BigQuery DataFrames will attempt to create one assuming + # the necessary APIs and IAM permissions are set up in the project. In our + # examples we would be using a pre-created connection named + # `bigframes-rf-conn`. Let's try a `pandas`-like use case in which we want + # to apply a user defined scalar function to every value in a `Series`, more + # specifically bucketize the `body_mass_g` value of the penguins, which is a + # real number, into a category, which is a string. + @bpd.remote_function([float], str, bigquery_connection="bigframes-rf-conn") + def get_bucket(num): + if not num: + return "NA" + boundary = 4000 + return "at_or_above_4000" if num >= boundary else "below_4000" + + # Then we can apply the remote function on the `Series` of interest via + # `apply` API and store the result in a new column in the DataFrame. + df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket)) + + # This will add a new column `body_mass_bucket` in the DataFrame. 
You can + # preview the original value and the bucketized value side by side. + df[["body_mass_g", "body_mass_bucket"]].head(10) + + # The above operation was possible by doing all the computation on the + # cloud. For that, there is a google cloud function deployed by serializing + # the user code, and a BigQuery remote function created to call the cloud + # function via the latter's http endpoint on the data in the DataFrame. + + # The BigQuery remote function created to support the BigQuery DataFrames + # remote function can be located via a property `bigframes_remote_function` + # set in the remote function object. + print(f"Created BQ remote function: {get_bucket.bigframes_remote_function}") + + # The cloud function can be located via another property + # `bigframes_cloud_function` set in the remote function object. + print(f"Created cloud function: {get_bucket.bigframes_cloud_function}") + + # Warning: The deployed cloud function may be visible to other users with + # sufficient privilege in the project, so the user should be careful about + # having any sensitive data in the code that will be deployed as a remote + # function. + + # Let's continue trying other potential use cases of remote functions. Let's + # say we consider the `species`, `island` and `sex` of the penguins + # sensitive information and want to redact that by replacing with their hash + # code instead. 
Let's define another scalar custom function and decorate it + # as a remote function + @bpd.remote_function([str], str, bigquery_connection="bigframes-rf-conn") + def get_hash(input): + import hashlib + + # handle missing value + if input is None: + input = "" + encoded_input = input.encode() + hash = hashlib.md5(encoded_input) + return hash.hexdigest() + + # We can use this remote function in another `pandas`-like API `map` that + # can be applied on a DataFrame + df_redacted = df[["species", "island", "sex"]].map(get_hash) + df_redacted.head(10) + + # [END bigquery_dataframes_remote_function] + + existing_get_bucket_bq_udf = get_bucket.bigframes_remote_function + + # [START bigquery_dataframes_read_gbq_function] + + # If you have already defined a custom function in BigQuery, either via the + # BigQuery Google Cloud Console or with the `remote_function` decorator, + # or otherwise, you may use it with BigQuery DataFrames with the + # `read_gbq_function` method. More details are available via the `help` + # command. + import bigframes.pandas as pd + + help(pd.read_gbq_function) + + # Here is an example of using `read_gbq_function` to load an existing + # BigQuery function. + df = pd.read_gbq("bigquery-public-data.ml_datasets.penguins") + get_bucket_function = pd.read_gbq_function(existing_get_bucket_bq_udf) + + df = df.assign(body_mass_bucket=df["body_mass_g"].apply(get_bucket_function)) + df.head(10) + + # It should be noted that if a function is created using the + # `remote_function` decorator, its created BQ remote function is accessible + # immediately afterward via the function's `bigframes_remote_function` + # attribute. The same string can be passed to `read_gbq_function` later in + # another context. 
+ + # [END bigquery_dataframes_read_gbq_function] + + # Clean up cloud artifacts + session = bpd.get_global_session() + for function in (get_bucket, get_hash): + try: + session.bqclient.delete_routine(function.bigframes_remote_function) + except Exception: + # Ignore exception during clean-up + pass + + try: + session.cloudfunctionsclient.delete_function( + name=function.bigframes_cloud_function + ) + except Exception: + # Ignore exception during clean-up + pass diff --git a/samples/snippets/remote_function_test.py b/samples/snippets/remote_function_test.py new file mode 100644 index 0000000000..8b51e46b45 --- /dev/null +++ b/samples/snippets/remote_function_test.py @@ -0,0 +1,32 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +import bigframes.pandas + +from . import remote_function + + +def test_remote_function_and_read_gbq_function( + capsys: pytest.CaptureFixture[str], +) -> None: + # We need a fresh session since we're modifying connection options. + bigframes.pandas.reset_session() + + # TODO(swast): Get project from environment so contributors can run tests. 
+ remote_function.run_remote_function_and_read_gbq_function("bigframes-dev") + out, _ = capsys.readouterr() + assert "Created BQ remote function:" in out + assert "Created cloud function:" in out diff --git a/tests/data/hockey_players.json b/tests/data/hockey_players.json new file mode 100644 index 0000000000..8a9b252992 --- /dev/null +++ b/tests/data/hockey_players.json @@ -0,0 +1,37 @@ +[ + { + "mode": "NULLABLE", + "name": "team_name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "position", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "player_name", + "type": "STRING" + }, + { + "mode": "NULLABLE", + "name": "goals", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "assists", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "number", + "type": "INTEGER" + }, + { + "mode": "NULLABLE", + "name": "season", + "type": "INTEGER" + } +] diff --git a/tests/data/hockey_players.jsonl b/tests/data/hockey_players.jsonl new file mode 100644 index 0000000000..d2b26cffdd --- /dev/null +++ b/tests/data/hockey_players.jsonl @@ -0,0 +1,10 @@ +{"team_name":"Canucks", "position":"C", "player_name":"Elias Petterson", "goals":39, "assists":63, "number":40, "season":2023} +{"team_name":"Canucks", "position":"LW", "player_name":"Ilya Mikheyev", "goals":13, "assists":15, "number":65, "season":2023} +{"team_name":"Canucks", "position":"RW", "player_name":"Andrei Kuzmenko", "goals":39, "assists":35, "number":40, "season":2023} +{"team_name":"Kraken", "position":"C", "player_name":"Jared McCann", "goals":40, "assists":30, "number":19, "season":2023} +{"team_name":"Kraken", "position":"LW", "player_name":"Yanni Gourde", "goals":14, "assists":34, "number":37, "season":2023} +{"team_name":"Kraken", "position":"RW", "player_name":"Jordan Eberle", "goals":20, "assists":43, "number":7, "season":2023} +{"team_name":"Canucks", "position":"C", "player_name":"Elias Petterson", "goals":32, "assists":36, "number":40, "season":2022} 
+{"team_name":"Kraken", "position":"C", "player_name":"Jared McCann", "goals":27, "assists":23, "number":19, "season":2022} +{"team_name":"Kraken", "position":"LW", "player_name":"Yanni Gourde", "goals":21, "assists":27, "number":37, "season":2022} +{"team_name":"Kraken", "position":"RW", "player_name":"Jordan Eberle", "goals":21, "assists":23, "number":7, "season":2022} diff --git a/tests/system/conftest.py b/tests/system/conftest.py index 41c8eaffd7..bf5cf12c74 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -211,6 +211,7 @@ def load_test_data_tables( ("scalars_too", "scalars_schema.json", "scalars.jsonl"), ("penguins", "penguins_schema.json", "penguins.jsonl"), ("time_series", "time_series_schema.json", "time_series.jsonl"), + ("hockey_players", "hockey_players.json", "hockey_players.jsonl"), ]: test_data_hash = hashlib.md5() _hash_digest_file(test_data_hash, DATA_DIR / schema_filename) @@ -255,6 +256,11 @@ def scalars_table_id(test_data_tables) -> str: return test_data_tables["scalars"] +@pytest.fixture(scope="session") +def hockey_table_id(test_data_tables) -> str: + return test_data_tables["hockey_players"] + + @pytest.fixture(scope="session") def scalars_table_id_2(test_data_tables) -> str: return test_data_tables["scalars_too"] @@ -354,6 +360,34 @@ def scalars_dfs( return scalars_df_index, scalars_pandas_df_index +@pytest.fixture(scope="session") +def hockey_df( + hockey_table_id: str, session: bigframes.Session +) -> bigframes.dataframe.DataFrame: + """DataFrame pointing at test data.""" + return session.read_gbq(hockey_table_id) + + +@pytest.fixture(scope="session") +def hockey_pandas_df() -> pd.DataFrame: + """pd.DataFrame pointing at test data.""" + df = pd.read_json( + DATA_DIR / "hockey_players.jsonl", + lines=True, + dtype={ + "team_name": pd.StringDtype(storage="pyarrow"), + "position": pd.StringDtype(storage="pyarrow"), + "player_name": pd.StringDtype(storage="pyarrow"), + "goals": pd.Int64Dtype(), + "assists": 
pd.Int64Dtype(), + "number": pd.Int64Dtype(), + "season": pd.Int64Dtype(), + }, + ) + df.index = df.index.astype("Int64") + return df + + @pytest.fixture(scope="session") def penguins_df_default_index( penguins_table_id: str, session: bigframes.Session @@ -721,3 +755,48 @@ def restore_sampling_settings(): yield bigframes.options.sampling.enable_downsampling = enable_downsampling bigframes.options.sampling.max_download_size = max_download_size + + +@pytest.fixture() +def weird_strings_pd(): + df = pd.DataFrame( + { + "string_col": [ + "٠١٢٣٤٥٦٧٨٩", + "", + "0", + "字", + "五", + "0123456789", + pd.NA, + "abc 123 mixed letters and numbers", + "no numbers here", + "123a", + "23!", + " 45", + "a45", + "Dž", + "tT", + "-123", + "-123.4", + "-0", + "-.0", + ".0", + ".1", + "⅙", + "²", + "\t", + "a\ta", + "p1\np2", + " ", + ] + }, + dtype=pd.StringDtype(storage="pyarrow"), + ) + df.index = df.index.astype("Int64") + return df.string_col + + +@pytest.fixture() +def weird_strings(session, weird_strings_pd): + return session.read_pandas(weird_strings_pd.to_frame()).string_col diff --git a/tests/system/large/ml/test_compose.py b/tests/system/large/ml/test_compose.py index 0c2744819d..b65baa63eb 100644 --- a/tests/system/large/ml/test_compose.py +++ b/tests/system/large/ml/test_compose.py @@ -21,7 +21,7 @@ import bigframes.ml.preprocessing -def test_columntransformer_standalone_fit_transform( +def test_columntransformer_standalone_fit_and_transform( penguins_df_default_index, new_penguins_df ): transformer = bigframes.ml.compose.ColumnTransformer( @@ -73,3 +73,54 @@ def test_columntransformer_standalone_fit_transform( ) pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) + + +def test_columntransformer_standalone_fit_transform(new_penguins_df): + transformer = bigframes.ml.compose.ColumnTransformer( + [ + ( + "onehot", + bigframes.ml.preprocessing.OneHotEncoder(), + "species", + ), + ( + "scale", + bigframes.ml.preprocessing.StandardScaler(), + ["culmen_length_mm", 
"flipper_length_mm"], + ), + ] + ) + + result = transformer.fit_transform( + new_penguins_df[["species", "culmen_length_mm", "flipper_length_mm"]] + ).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pandas.DataFrame( + { + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + "scaled_culmen_length_mm": [ + 1.313249, + -0.20198, + -1.111118, + ], + "scaled_flipper_length_mm": [1.251098, -1.196588, -0.054338], + }, + index=pandas.Index([1633, 1672, 1690], dtype="Int64", name="tag_number"), + ) + expected.scaled_culmen_length_mm = expected.scaled_culmen_length_mm.astype( + "Float64" + ) + expected.scaled_flipper_length_mm = expected.scaled_flipper_length_mm.astype( + "Float64" + ) + + pandas.testing.assert_frame_equal(result, expected, rtol=1e-3) diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 74356c81e1..181678ebcb 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -15,6 +15,7 @@ from unittest import TestCase import numpy as np +import pytest def test_create_text_generator_model(palm2_text_generator_model): @@ -22,6 +23,8 @@ def test_create_text_generator_model(palm2_text_generator_model): assert palm2_text_generator_model is not None +# Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
+@pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -32,6 +35,7 @@ def test_text_generator_predict_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_series_default_params_success( palm2_text_generator_model, llm_text_df ): @@ -42,6 +46,7 @@ def test_text_generator_predict_series_default_params_success( assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_arbitrary_col_label_success( palm2_text_generator_model, llm_text_df ): @@ -53,6 +58,7 @@ def test_text_generator_predict_arbitrary_col_label_success( assert all(series.str.len() > 20) +@pytest.mark.flaky(retries=2, delay=120) def test_text_generator_predict_with_params_success( palm2_text_generator_model, llm_text_df ): @@ -70,6 +76,7 @@ def test_create_embedding_generator_model(palm2_embedding_generator_model): assert palm2_embedding_generator_model is not None +@pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_success( palm2_embedding_generator_model, llm_text_df ): @@ -82,6 +89,7 @@ def test_embedding_generator_predict_success( assert value.size == 768 +@pytest.mark.flaky(retries=2, delay=120) def test_embedding_generator_predict_series_success( palm2_embedding_generator_model, llm_text_df ): diff --git a/tests/system/small/ml/test_preprocessing.py b/tests/system/small/ml/test_preprocessing.py index 420a80754f..57b9900c48 100644 --- a/tests/system/small/ml/test_preprocessing.py +++ b/tests/system/small/ml/test_preprocessing.py @@ -24,13 +24,13 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): scaler = bigframes.ml.preprocessing.StandardScaler() scaler.fit( penguins_df_default_index[ - "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm" + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] ] ) result = 
scaler.transform( penguins_df_default_index[ - "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm" + ["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"] ] ).to_pandas() @@ -58,6 +58,35 @@ def test_standard_scaler_normalizes(penguins_df_default_index, new_penguins_df): pd.testing.assert_frame_equal(result, expected, rtol=1e-3) +def test_standard_scaler_normalizeds_fit_transform(new_penguins_df): + # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. + scaler = bigframes.ml.preprocessing.StandardScaler() + result = scaler.fit_transform( + new_penguins_df[["culmen_length_mm", "culmen_depth_mm", "flipper_length_mm"]] + ).to_pandas() + + # If standard-scaled correctly, mean should be 0.0 + for column in result.columns: + assert math.isclose(result[column].mean(), 0.0, abs_tol=1e-3) + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "scaled_culmen_depth_mm": [1.17072, -1.272416, 0.101848], + "scaled_culmen_length_mm": [1.313249, -0.20198, -1.111118], + "scaled_flipper_length_mm": [1.251089, -1.196588, -0.054338], + }, + dtype="Float64", + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected, rtol=1e-3) + + def test_standard_scaler_series_normalizes(penguins_df_default_index, new_penguins_df): # TODO(http://b/292431644): add a second test that compares output to sklearn.preprocessing.StandardScaler, when BQML's change is in prod. 
scaler = bigframes.ml.preprocessing.StandardScaler() @@ -93,7 +122,7 @@ def test_standard_scaler_series_normalizes(penguins_df_default_index, new_pengui def test_one_hot_encoder_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() - encoder.fit(new_penguins_df["species", "sex"]) + encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -121,6 +150,35 @@ def test_one_hot_encoder_default_params(new_penguins_df): pd.testing.assert_frame_equal(result, expected) +def test_one_hot_encoder_default_params_fit_transform(new_penguins_df): + encoder = bigframes.ml.preprocessing.OneHotEncoder() + + result = encoder.fit_transform(new_penguins_df[["species", "sex"]]).to_pandas() + + # TODO: bug? feature columns seem to be in nondeterministic random order + # workaround: sort columns by name. Can't repro it in pantheon, so could + # be a bigframes issue... + result = result.reindex(sorted(result.columns), axis=1) + + expected = pd.DataFrame( + { + "onehotencoded_sex": [ + [{"index": 2, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + ], + "onehotencoded_species": [ + [{"index": 1, "value": 1.0}], + [{"index": 1, "value": 1.0}], + [{"index": 2, "value": 1.0}], + ], + }, + index=pd.Index([1633, 1672, 1690], name="tag_number", dtype="Int64"), + ) + + pd.testing.assert_frame_equal(result, expected) + + def test_one_hot_encoder_series_default_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() encoder.fit(new_penguins_df["species"]) @@ -148,7 +206,7 @@ def test_one_hot_encoder_series_default_params(new_penguins_df): def test_one_hot_encoder_params(new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder("most_frequent", 100, 2) - encoder.fit(new_penguins_df["species", "sex"]) + encoder.fit(new_penguins_df[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() @@ -178,7 +236,7 @@ def 
test_one_hot_encoder_params(new_penguins_df): def test_one_hot_encoder_different_data(penguins_df_default_index, new_penguins_df): encoder = bigframes.ml.preprocessing.OneHotEncoder() - encoder.fit(penguins_df_default_index["species", "sex"]) + encoder.fit(penguins_df_default_index[["species", "sex"]]) result = encoder.transform(new_penguins_df).to_pandas() diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py index 31b64f4314..241cbd576b 100644 --- a/tests/system/small/operations/test_strings.py +++ b/tests/system/small/operations/test_strings.py @@ -254,31 +254,93 @@ def test_upper(scalars_dfs): ) -def test_isnumeric(session): - pandas_df = pd.DataFrame( - { - "numeric_string_col": [ - "٠١٢٣٤٥٦٧٨٩", - "", - "0", - "字", - "五", - "0123456789", - pd.NA, - "abc 123 mixed letters and numbers", - "no numbers here", - "123a", - "23!", - " 45", - "a45", - ] - } - ) - - df = session.read_pandas(pandas_df) - - pd_result = pandas_df.numeric_string_col.str.isnumeric() - bf_result = df.numeric_string_col.str.isnumeric().to_pandas() +def test_isnumeric(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isnumeric() + bf_result = weird_strings.str.isnumeric().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isalpha(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isalpha() + bf_result = weird_strings.str.isalpha().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isdigit(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isdigit() + bf_result = weird_strings.str.isdigit().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + 
pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isdecimal(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isdecimal() + bf_result = weird_strings.str.isdecimal().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isalnum(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isalnum() + bf_result = weird_strings.str.isalnum().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isspace(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isspace() + bf_result = weird_strings.str.isspace().to_pandas() + + pd.testing.assert_series_equal( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_islower(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.islower() + bf_result = weird_strings.str.islower().to_pandas() + + assert_series_equal_ignoring_order( + bf_result, + pd_result.astype(pd.BooleanDtype()) + # the dtype here is a case of intentional diversion from pandas + # see go/bigframes-dtypes + ) + + +def test_isupper(weird_strings, weird_strings_pd): + pd_result = weird_strings_pd.str.isupper() + bf_result = weird_strings.str.isupper().to_pandas() assert_series_equal_ignoring_order( bf_result, @@ -394,9 +456,6 @@ def test_str_get(scalars_dfs): bf_result = bf_series.str.get(8).to_pandas() pd_result = scalars_pandas_df[col_name].str.get(8) - print(pd_result) - print(bf_result) - assert_series_equal_ignoring_order( pd_result, bf_result, @@ -416,6 +475,16 @@ def 
test_str_pad(scalars_dfs): ) +def test_str_zfill(weird_strings, weird_strings_pd): + bf_result = weird_strings.str.zfill(5).to_pandas() + pd_result = weird_strings_pd.str.zfill(5) + + pd.testing.assert_series_equal( + pd_result, + bf_result, + ) + + def test_str_ljust(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "string_col" diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 2c44dd8067..5b4f9ebccc 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -840,6 +840,14 @@ def test_set_index(scalars_dfs, index_column, drop, append): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_set_index_key_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + with pytest.raises(KeyError): + scalars_pandas_df.set_index(["not_a_col"]) + with pytest.raises(KeyError): + scalars_df.set_index(["not_a_col"]) + + @pytest.mark.parametrize( ("ascending",), ((True,), (False,)), @@ -1321,6 +1329,41 @@ def test_df_describe(scalars_dfs): ).all() +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + ("int64_col", "int64_too", ["string_col"]), + (["int64_col"], "int64_too", ["string_col"]), + (["int64_col", "float64_col"], "int64_too", ["string_col"]), + ], +) +def test_df_pivot(scalars_dfs, values, index, columns): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.pivot( + values=values, index=index, columns=columns + ).to_pandas() + pd_result = scalars_pandas_df.pivot(values=values, index=index, columns=columns) + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("values", "index", "columns"), + [ + (["goals", "assists"], ["team_name", "season"], ["position"]), + (["goals", "assists"], ["season"], ["team_name", "position"]), + ], +) +def test_df_pivot_hockey(hockey_df, hockey_pandas_df, values, index, 
columns): + bf_result = hockey_df.pivot(values=values, index=index, columns=columns).to_pandas() + pd_result = hockey_pandas_df.pivot(values=values, index=index, columns=columns) + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + def test_ipython_key_completions_with_drop(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_names = "string_col" @@ -1621,6 +1664,7 @@ def test_sample_raises_value_error(scalars_dfs): @pytest.mark.parametrize( ("axis",), [ + (None,), (0,), (1,), ], diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index d5dd4e357b..5a2562bfb2 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -109,8 +109,6 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): ) bf_result_computed = bf_result.to_pandas() - # Pandas produces multi-index which isn't supported in bq df yet - pd_result = pd_result.set_axis(bf_result.columns, axis=1) pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) @@ -128,8 +126,6 @@ def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index): ) bf_result_computed = bf_result.to_pandas() - # Pandas produces multi-index which isn't supported in bq df yet - pd_result = pd_result.set_axis(bf_result.columns, axis=1) pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) diff --git a/tests/system/small/test_ipython.py b/tests/system/small/test_ipython.py new file mode 100644 index 0000000000..6725805d9a --- /dev/null +++ b/tests/system/small/test_ipython.py @@ -0,0 +1,28 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +IPython = pytest.importorskip("IPython") + + +def test_repr_cache(scalars_df_index): + display_formatter = IPython.core.formatters.DisplayFormatter() + # Make sure the df has a new block that the method return value + # is not already cached. + test_df = scalars_df_index.head() + results = display_formatter.format(test_df) + assert results[0].keys() == {"text/plain", "text/html"} + assert test_df._block.retrieve_repr_request_results.cache_info().misses == 1 + assert test_df._block.retrieve_repr_request_results.cache_info().hits == 1 diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 914be6dae4..b2937d7da9 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -19,6 +19,7 @@ from tests.system.utils import assert_pandas_df_equal_ignore_ordering +# Row Multi-index tests def test_set_multi_index(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.set_index(["bool_col", "int64_too"]).to_pandas() pd_result = scalars_pandas_df_index.set_index(["bool_col", "int64_too"]) @@ -443,3 +444,179 @@ def test_multi_index_series_rename_dict_same_type( pandas.testing.assert_series_equal( bf_result, pd_result, check_dtype=False, check_index_type=False ) + + +# Column Multi-index tests + + +def test_column_multi_index_getitem(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns)) + bf_df = scalars_df_index[columns].copy() + 
bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_a = bf_df["a"].to_pandas() + pd_a = pd_df["a"] + pandas.testing.assert_frame_equal(bf_a, pd_a) + + bf_b = bf_df["b"].to_pandas() + pd_b = pd_df["b"] + pandas.testing.assert_frame_equal(bf_b, pd_b) + + bf_fullkey = bf_df[("a", "int64_too")].to_pandas() + pd_fullkey = pd_df[("a", "int64_too")] + pandas.testing.assert_series_equal(bf_fullkey, pd_fullkey) + + +def test_column_multi_index_concat(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col", "int64_col"] + multi_columns1 = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a", "b"], [1, 1, 2, 2]) + ) + multi_columns2 = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a", "c"], [3, 1, 2, 1]) + ) + + bf_df1 = scalars_df_index[columns].copy() + bf_df1.columns = multi_columns1 + bf_df2 = scalars_df_index[columns].copy() + bf_df2.columns = multi_columns2 + + pd_df1 = scalars_pandas_df_index[columns].copy() + pd_df1.columns = multi_columns1 + pd_df2 = scalars_pandas_df_index[columns].copy() + pd_df2.columns = multi_columns2 + + bf_result = bpd.concat([bf_df1, bf_df2, bf_df1]).to_pandas() + pd_result = pandas.concat([pd_df1, pd_df2, pd_df1]) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_column_multi_index_drop(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "string_col", "bool_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns)) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_a = bf_df.drop(("a", "int64_too"), axis=1).to_pandas() + pd_a = pd_df.drop(("a", "int64_too"), axis=1) + pandas.testing.assert_frame_equal(bf_a, pd_a) + + +@pytest.mark.parametrize( + ("key",), + [ + ("a",), + ("b",), + ("c",), + ], +) +def test_column_multi_index_assign(scalars_df_index, 
scalars_pandas_df_index, key): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], columns)) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + kwargs = {key: 42} + bf_result = bf_df.assign(**kwargs).to_pandas() + pd_result = pd_df.assign(**kwargs) + + # Pandas assign results in non-nullable dtype + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.rename(columns={"b": "c"}).to_pandas() + pd_result = pd_df.rename(columns={"b": "c"}) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.reset_index().to_pandas() + pd_result = pd_df.reset_index() + + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_column_multi_index_binary_op(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = (bf_df[("a", "a")] + 3).to_pandas() + pd_result = pd_df[("a", "a")] + 3 + + pandas.testing.assert_series_equal(bf_result, pd_result) + + +def test_column_multi_index_agg(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.agg(["sum", "mean"]).to_pandas() + pd_result = pd_df.agg(["sum", "mean"]) + + # Pandas may produce narrower numeric types, but bigframes always produces Float64 + pd_result = pd_result.astype("Float64") + pandas.testing.assert_frame_equal(bf_result, pd_result, check_index_type=False) + + +def test_column_multi_index_prefix_suffix(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.add_prefix("prefixed_").add_suffix("_suffixed").to_pandas() + pd_result = pd_df.add_prefix("prefixed_").add_suffix("_suffixed") + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_column_multi_index_cumsum(scalars_df_index, 
scalars_pandas_df_index): + if pandas.__version__.startswith("1."): + pytest.skip("pandas 1.x. does not handle nullable ints properly in cumsum") + columns = ["int64_too", "int64_col", "float64_col"] + multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.cumsum().to_pandas() + pd_result = pd_df.cumsum() + + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 96697dbcab..9a3d55aed2 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -20,6 +20,7 @@ import google.auth.exceptions import pytest +import bigframes.core.global_session import bigframes.pandas as bpd @@ -289,7 +290,7 @@ def test_reset_session_after_bq_session_ended(): # Now try to reset session and verify that it works bpd.reset_session() - assert bpd._global_session is None + assert bigframes.core.global_session._global_session is None # Now verify that use is able to start over df = bpd.read_gbq(test_query) @@ -332,7 +333,7 @@ def test_reset_session_after_credentials_need_reauthentication(monkeypatch): # Now verify that resetting the session works bpd.reset_session() - assert bpd._global_session is None + assert bigframes.core.global_session._global_session is None # Now verify that use is able to start over df = bpd.read_gbq(test_query) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index 47d758763b..e40addc4eb 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from google.cloud import bigquery +from ibis.backends.bigquery import datatypes as bq_types +from ibis.expr import datatypes as ibis_types import pandas as pd import pytest import bigframes -from bigframes.remote_function import read_gbq_function, remote_function +from bigframes import remote_function as rf from tests.system.utils import assert_pandas_df_equal_ignore_ordering @@ -98,6 +101,16 @@ def session_with_bq_connection_location_project_specified( ) +def test_supported_types_correspond(): + # The same types should be representable by the supported Python and BigQuery types. + ibis_types_from_python = {ibis_types.dtype(t) for t in rf.SUPPORTED_IO_PYTHON_TYPES} + ibis_types_from_bigquery = { + bq_types.BigQueryType.to_ibis(tk) for tk in rf.SUPPORTED_IO_BIGQUERY_TYPEKINDS + } + + assert ibis_types_from_python == ibis_types_from_bigquery + + @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_no_session_param( bigquery_client, @@ -107,7 +120,7 @@ def test_remote_function_direct_no_session_param( dataset_id_permanent, bq_cf_connection, ): - @remote_function( + @rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -157,7 +170,7 @@ def test_remote_function_direct_no_session_param_location_specified( dataset_id_permanent, bq_cf_connection_location, ): - @remote_function( + @rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -205,7 +218,7 @@ def test_remote_function_direct_no_session_param_location_mismatched( ): with pytest.raises(ValueError): - @remote_function( + @rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -229,7 +242,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( dataset_id_permanent, bq_cf_connection_location_project, ): - @remote_function( + @rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -277,7 +290,7 @@ def test_remote_function_direct_no_session_param_project_mismatched( ): with pytest.raises(ValueError): - @remote_function( + 
@rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -294,7 +307,7 @@ def square(x): @pytest.mark.flaky(retries=2, delay=120) def test_remote_function_direct_session_param(session_with_bq_connection, scalars_dfs): - @remote_function( + @rf.remote_function( [int], int, session=session_with_bq_connection, @@ -500,6 +513,18 @@ def add_one(x): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_detects_invalid_function(bigquery_client, dataset_id): + dataset_ref = bigquery.DatasetReference.from_string(dataset_id) + with pytest.raises(ValueError) as e: + rf.read_gbq_function( + str(dataset_ref.routine("not_a_function")), + bigquery_client=bigquery_client, + ) + + assert "Unknown function" in str(e.value) + + @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_like_original( bigquery_client, @@ -509,7 +534,7 @@ def test_read_gbq_function_like_original( dataset_id_permanent, bq_cf_connection, ): - @remote_function( + @rf.remote_function( [int], int, bigquery_client=bigquery_client, @@ -522,7 +547,7 @@ def test_read_gbq_function_like_original( def square1(x): return x * x - square2 = read_gbq_function( + square2 = rf.read_gbq_function( function_name=square1.bigframes_remote_function, bigquery_client=bigquery_client, ) @@ -551,3 +576,111 @@ def square1(x): s2_result = int64_col_filtered.to_frame().assign(result=s2_result_col) assert_pandas_df_equal_ignore_ordering(s1_result.to_pandas(), s2_result.to_pandas()) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_reads_udfs(bigquery_client, scalars_dfs, dataset_id): + dataset_ref = bigquery.DatasetReference.from_string(dataset_id) + arg = bigquery.RoutineArgument( + name="x", + data_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + ) + sql_routine = bigquery.Routine( + dataset_ref.routine("square_sql"), + body="x * x", + arguments=[arg], + 
return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + js_routine = bigquery.Routine( + dataset_ref.routine("square_js"), + body="return x * x", + language="JAVASCRIPT", + arguments=[arg], + return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + + for routine in (sql_routine, js_routine): + # Create the routine in BigQuery and read it back using read_gbq_function. + bigquery_client.create_routine(routine, exists_ok=True) + square = rf.read_gbq_function( + str(routine.reference), bigquery_client=bigquery_client + ) + + # It should point to the named routine and yield the expected results. + assert square.bigframes_remote_function == str(routine.reference) + + src = {"x": [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]} + + routine_ref_str = rf.routine_ref_to_string_for_query(routine.reference) + direct_sql = " UNION ALL ".join( + [f"SELECT {x} AS x, {routine_ref_str}({x}) AS y" for x in src["x"]] + ) + direct_df = bigquery_client.query(direct_sql).to_dataframe() + + indirect_df = bigframes.dataframe.DataFrame(src) + indirect_df = indirect_df.assign(y=indirect_df.x.apply(square)) + indirect_df = indirect_df.to_pandas() + + assert_pandas_df_equal_ignore_ordering(direct_df, indirect_df) + + +@pytest.mark.flaky(retries=2, delay=120) +def test_read_gbq_function_enforces_explicit_types(bigquery_client, dataset_id): + dataset_ref = bigquery.DatasetReference.from_string(dataset_id) + typed_arg = bigquery.RoutineArgument( + name="x", + data_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + ) + untyped_arg = bigquery.RoutineArgument( + name="x", + kind="ANY_TYPE", # With this kind, data_type not required for SQL functions. 
+ ) + + both_types_specified = bigquery.Routine( + dataset_ref.routine("both_types_specified"), + body="x * x", + arguments=[typed_arg], + return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + only_return_type_specified = bigquery.Routine( + dataset_ref.routine("only_return_type_specified"), + body="x * x", + arguments=[untyped_arg], + return_type=bigquery.StandardSqlDataType(bigquery.StandardSqlTypeNames.INT64), + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + only_arg_type_specified = bigquery.Routine( + dataset_ref.routine("only_arg_type_specified"), + body="x * x", + arguments=[typed_arg], + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + neither_type_specified = bigquery.Routine( + dataset_ref.routine("neither_type_specified"), + body="x * x", + arguments=[untyped_arg], + type_=bigquery.RoutineType.SCALAR_FUNCTION, + ) + + bigquery_client.create_routine(both_types_specified, exists_ok=True) + bigquery_client.create_routine(only_return_type_specified, exists_ok=True) + bigquery_client.create_routine(only_arg_type_specified, exists_ok=True) + bigquery_client.create_routine(neither_type_specified, exists_ok=True) + + rf.read_gbq_function( + str(both_types_specified.reference), bigquery_client=bigquery_client + ) + rf.read_gbq_function( + str(only_return_type_specified.reference), bigquery_client=bigquery_client + ) + with pytest.raises(ValueError): + rf.read_gbq_function( + str(only_arg_type_specified.reference), bigquery_client=bigquery_client + ) + with pytest.raises(ValueError): + rf.read_gbq_function( + str(neither_type_specified.reference), bigquery_client=bigquery_client + ) diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index 2fc34f9bae..d825c62561 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations 
under the License. +import io import random import tempfile import textwrap @@ -683,6 +684,43 @@ def test_read_csv_local_w_encoding(session, penguins_pandas_df_default_index, en assert df.shape[0] == penguins_pandas_df_default_index.shape[0] +def test_read_pickle_local(session, penguins_pandas_df_default_index, tmp_path): + path = tmp_path / "test_read_csv_local_w_encoding.pkl" + + penguins_pandas_df_default_index.to_pickle(path) + df = session.read_pickle(path) + + pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + + +def test_read_pickle_buffer(session, penguins_pandas_df_default_index): + buffer = io.BytesIO() + penguins_pandas_df_default_index.to_pickle(buffer) + buffer.seek(0) + df = session.read_pickle(buffer) + + pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + + +def test_read_pickle_series_buffer(session): + pd_series = pd.Series([1, 2, 3, 4, 5], dtype="Int64") + buffer = io.BytesIO() + pd_series.to_pickle(buffer) + buffer.seek(0) + bf_series = session.read_pickle(buffer).to_pandas() + pd_series.index = pd_series.index.astype("Int64") + + assert (pd_series == bf_series).all() + + +def test_read_pickle_gcs(session, penguins_pandas_df_default_index, gcs_folder): + path = gcs_folder + "test_read_pickle_gcs.pkl" + penguins_pandas_df_default_index.to_pickle(path) + df = session.read_pickle(path) + + pd.testing.assert_frame_equal(penguins_pandas_df_default_index, df.to_pandas()) + + def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): scalars_df, _ = scalars_dfs # Include wildcard so that multiple files can be written/read if > 1 GB. 
diff --git a/tests/unit/test_dtypes.py b/tests/unit/test_dtypes.py index dafed08980..bb8ae570dc 100644 --- a/tests/unit/test_dtypes.py +++ b/tests/unit/test_dtypes.py @@ -175,9 +175,12 @@ def test_literal_to_ibis_scalar_throws_on_incompatible_literal(): def test_remote_function_io_types_are_supported_bigframes_types(): + from ibis.expr.datatypes.core import dtype as python_type_to_bigquery_type + from bigframes.remote_function import ( - _supported_io_ibis_types as rf_supported_io_ibis_types, + SUPPORTED_IO_PYTHON_TYPES as rf_supported_io_types, ) - for ibis_type in rf_supported_io_ibis_types: + for python_type in rf_supported_io_types: + ibis_type = python_type_to_bigquery_type(python_type) assert ibis_type in bigframes.dtypes.IBIS_TO_BIGFRAMES diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 653b65c834..6762afc61f 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -44,6 +44,7 @@ def axes(self) -> list: They are returned in that order. Examples + .. code-block:: df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) @@ -175,8 +176,8 @@ def assign(self, **kwargs) -> DataFrame: are simply assigned to the column. Returns: - DataFrame: A new DataFrame with the new columns in addition to - all the existing columns. + bigframes.dataframe.DataFrame: A new DataFrame with the new columns + in addition to all the existing columns. """ raise NotImplementedError("abstract method") @@ -205,7 +206,7 @@ def drop( level: For MultiIndex, level from which the labels will be removed. Returns: - DataFrame: DataFrame without the removed column labels. + bigframes.dataframe.DataFrame: DataFrame without the removed column labels. Raises: KeyError: If any of the labels is not found in the selected axis. @@ -227,7 +228,7 @@ def rename( Dict-like from old column labels to new column labels. 
Returns: - DataFrame: DataFrame with the renamed axis labels. + bigframes.dataframe.DataFrame: DataFrame with the renamed axis labels. Raises: KeyError: If any of the labels is not found. @@ -238,7 +239,7 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: """ Set the name of the axis for the index. - .. Note:: + .. note:: Currently only accepts a single string parameter (the new name of the index). @@ -247,7 +248,7 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Value to set the axis name attribute. Returns: - DataFrame: DataFrame with the new index name + bigframes.dataframe.DataFrame: DataFrame with the new index name """ raise NotImplementedError("abstract method") @@ -317,7 +318,7 @@ def reset_index( the index to the default integer index. Returns: - DataFrame: DataFrame with the new index. + bigframes.dataframe.DataFrame: DataFrame with the new index. """ raise NotImplementedError("abstract method") @@ -344,7 +345,7 @@ def drop_duplicates( - ``False`` : Drop all duplicates. Returns: - DataFrame: DataFrame with duplicates removed + bigframes.dataframe.DataFrame: DataFrame with duplicates removed """ raise NotImplementedError("abstract method") @@ -366,7 +367,7 @@ def duplicated(self, subset=None, keep="first"): - False : Mark all duplicates as ``True``. Returns: - Boolean series for each duplicated rows. + bigframes.series.Series: Boolean series for each duplicated row. """ raise NotImplementedError("abstract method") @@ -379,7 +380,7 @@ def dropna( """Remove missing values. Returns: - DataFrame: DataFrame with NA entries dropped from it. + bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it. """ raise NotImplementedError("abstract method") @@ -844,7 +845,7 @@ def groupby( values will also be treated as the key in groups. Returns: - A groupby object that contains information about the groups. + bigframes.core.groupby.DataFrameGroupBy: A groupby object that contains information about the groups.
""" raise NotImplementedError("abstract method") @@ -871,7 +872,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: values, without passing them to func. Returns: - DataFrame: Transformed DataFrame. + bigframes.dataframe.DataFrame: Transformed DataFrame. """ raise NotImplementedError("abstract method") @@ -899,7 +900,7 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: index, preserving the order of the calling's one. Returns: - DataFrame: A dataframe containing columns from both the caller and `other`. + bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. """ raise NotImplementedError("abstract method") @@ -969,7 +970,7 @@ def merge( no suffix. At least one of the values must not be None. Returns: - DataFrame: A DataFrame of the two merged objects. + bigframes.dataframe.DataFrame: A DataFrame of the two merged objects. """ raise NotImplementedError("abstract method") @@ -1006,7 +1007,7 @@ def all(self, *, bool_only: bool = False): Include only boolean columns. Returns: - Series + bigframes.series.Series: Series if all elements are True. """ raise NotImplementedError("abstract method") @@ -1019,7 +1020,7 @@ def prod(self, *, numeric_only: bool = False): Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with the product of the values. """ raise NotImplementedError("abstract method") @@ -1034,7 +1035,7 @@ def min(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with the minimum of the values. """ raise NotImplementedError("abstract method") @@ -1049,7 +1050,7 @@ def max(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series after the maximum of values. 
""" raise NotImplementedError("abstract method") @@ -1063,7 +1064,7 @@ def sum(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with the sum of values. """ raise NotImplementedError("abstract method") @@ -1075,7 +1076,7 @@ def mean(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with the mean of values. """ raise NotImplementedError("abstract method") @@ -1090,7 +1091,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): one. Note: ``exact=True`` not yet supported. Returns: - Series + bigframes.series.Series: Series with the median of values. """ raise NotImplementedError("abstract method") @@ -1104,7 +1105,7 @@ def var(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with unbiased variance over requested axis. """ raise NotImplementedError("abstract method") @@ -1118,7 +1119,7 @@ def std(self, *, numeric_only: bool = False): Default False. Include only float, int, boolean columns. Returns: - Series + bigframes.series.Series: Series with sample standard deviation. """ raise NotImplementedError("abstract method") @@ -1134,8 +1135,8 @@ def count(self, *, numeric_only: bool = False): Include only `float`, `int` or `boolean` data. Returns: - For each column/row the number of non-NA/null entries. - If `level` is specified returns a `DataFrame`. + bigframes.series.Series: For each column/row the number of + non-NA/null entries. If `level` is specified returns a `DataFrame`. """ raise NotImplementedError("abstract method") @@ -1143,10 +1144,8 @@ def nunique(self): """ Count number of distinct elements in specified axis. - Return Series with number of distinct elements. - Returns: - Series + bigframes.series.Series: Series with number of distinct elements. 
""" raise NotImplementedError("abstract method") @@ -1156,7 +1155,7 @@ def cummin(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative minimum. Returns: - DataFrame: Return cumulative minimum of DataFrame. + bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1166,7 +1165,7 @@ def cummax(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative maximum. Returns: - DataFrame: Return cumulative maximum of DataFrame. + bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1176,7 +1175,7 @@ def cumsum(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative sum. Returns: - DataFrame: Return cumulative sum of DataFrame. + bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame. """ raise NotImplementedError("abstract method") @@ -1186,7 +1185,7 @@ def cumprod(self) -> DataFrame: Returns a DataFrame of the same size containing the cumulative product. Returns: - DataFrame: Return cumulative product of DataFrame. + bigframes.dataframe.DataFrame: Return cumulative product of DataFrame. """ raise NotImplementedError("abstract method") @@ -1201,7 +1200,7 @@ def agg(self, func): function names, e.g. ``['sum', 'mean']``. Returns: - Series or DataFrame: Aggregated results + DataFrame or bigframes.series.Series: Aggregated results. """ raise NotImplementedError("abstract method") @@ -1218,17 +1217,51 @@ def describe(self): .. note:: Percentile values are approximates only. + .. note:: + For numeric data, the result's index will include ``count``, + ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and + upper percentiles. By default the lower percentile is ``25`` and the + upper percentile is ``75``. The ``50`` percentile is the + same as the median. + Returns: - Summary statistics of the Series or Dataframe provided. 
+ bigframes.dataframe.DataFrame: Summary statistics of the Series or DataFrame provided. + """ + raise NotImplementedError("abstract method") + + def pivot(self, *, columns, index=None, values=None): + """ + Return reshaped DataFrame organized by given index / column values. + + Reshape data (produce a "pivot" table) based on column values. Uses + unique values from specified `index` / `columns` to form axes of the + resulting DataFrame. This function does not support data + aggregation; multiple values will result in a MultiIndex in the + columns. + + .. note:: + BigQuery supports up to 10000 columns. Pivot operations on columns + with too many unique values will fail if they would exceed this limit. + + .. note:: + The validity of the pivot operation is not checked. If columns and index + do not together uniquely identify input rows, the output will be + silently non-deterministic. + Args: + columns (str or object or a list of str): + Column to use to make new frame's columns. - Notes - ----- - For numeric data, the result's index will include ``count``, - ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and - upper percentiles. By default the lower percentile is ``25`` and the - upper percentile is ``75``. The ``50`` percentile is the - same as the median. + index (str or object or a list of str, optional): + Column to use to make new frame's index. If not given, uses existing index. + + values (str, object or a list of the previous, optional): + Column(s) to use for populating new frame's values. If not + specified, all remaining columns will be used and the result will + have hierarchically indexed columns. + + Returns: + bigframes.dataframe.DataFrame: Returns reshaped DataFrame.
""" raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 039dc1eae0..79eb402696 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -211,7 +211,7 @@ def to_frame(self) -> DataFrame: Convert Series to DataFrame. Returns: - DataFrame: DataFrame representation of Series. + bigframes.dataframe.DataFrame: DataFrame representation of Series. """ raise NotImplementedError("abstract method") @@ -419,7 +419,7 @@ def mode(self) -> Series: Always returns Series even if only one value is returned. Returns: - Series: Modes of the Series in sorted order. + bigframes.series.Series: Modes of the Series in sorted order. """ raise NotImplementedError("abstract method") @@ -440,7 +440,7 @@ def drop_duplicates( ``False`` : Drop all duplicates. Returns: - Series: Series with duplicates dropped or None if ``inplace=True``. + bigframes.series.Series: Series with duplicates dropped or None if ``inplace=True``. """ raise NotImplementedError("abstract method") @@ -463,8 +463,8 @@ def duplicated(self, keep="first") -> Series: ``False`` : Mark all duplicates as ``True``. Returns: - Series: Series indicating whether each value has occurred in the - preceding values. + bigframes.series.Series: Series indicating whether each value has occurred in the + preceding values. """ raise NotImplementedError("abstract method") @@ -478,7 +478,7 @@ def round(self, decimals: int = 0) -> Series: it specifies the number of positions to the left of the decimal point. Returns: - Series: Rounded values of the Series. + bigframes.series.Series: Rounded values of the Series. """ raise NotImplementedError("abstract method") @@ -569,7 +569,7 @@ def sort_values( the end. Returns: - Series or None: Series ordered by values or None if ``inplace=True``. 
+ bigframes.series.Series: Series ordered by values or None if ``inplace=True``. """ raise NotImplementedError("abstract method") @@ -597,7 +597,7 @@ def sort_index( Not implemented for MultiIndex. Returns: - Series or None: The original Series sorted by the labels or None if + bigframes.series.Series: The original Series sorted by the labels or None if ``inplace=True``. """ @@ -624,7 +624,7 @@ def nlargest( size larger than `n`. Returns: - Series: The `n` largest values in the Series, sorted in decreasing order. + bigframes.series.Series: The `n` largest values in the Series, sorted in decreasing order. """ raise NotImplementedError("abstract method") @@ -647,7 +647,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: size larger than `n`. Returns: - Series: The `n` smallest values in the Series, sorted in increasing order. + bigframes.series.Series: The `n` smallest values in the Series, sorted in increasing order. """ raise NotImplementedError("abstract method") @@ -669,7 +669,7 @@ def apply( Python function or NumPy ufunc to apply. Returns: - Series or DataFrame: If func returns a Series object the result + bigframes.series.Series: If func returns a Series object the result will be a DataFrame. """ raise NotImplementedError("abstract method") @@ -723,7 +723,8 @@ def groupby( If False, NA values will also be treated as the key in groups. Returns: - SeriesGroupBy: Returns a groupby object that contains information about the groups. + bigframes.core.groupby.SeriesGroupBy: Returns a groupby object that contains + information about the groups. """ raise NotImplementedError("abstract method") @@ -750,15 +751,12 @@ def drop( level: For MultiIndex, level for which the labels will be removed. - Returns - ------- - Series or None - Series with specified index labels removed or None if ``inplace=True``. + Returns: + bigframes.series.Series: Series with specified index labels removed + or None if ``inplace=True``. 
- - Raises - ------ - KeyError - If none of the labels are found in the index. + Raises: + KeyError: If none of the labels are found in the index. """ raise NotImplementedError("abstract method") @@ -844,7 +842,7 @@ def cumprod(self): product. Returns: - Return cumulative sum of scalar or Series. + bigframes.series.Series: Return cumulative product of scalar or Series. """ raise NotImplementedError("abstract method") @@ -878,7 +876,7 @@ def cummax(self): For `Series` this parameter is unused and defaults to 0. Returns: - scalar or Series: Return cumulative maximum of scalar or Series. + bigframes.series.Series: Return cumulative maximum of scalar or Series. """ raise NotImplementedError("abstract method") @@ -901,7 +899,7 @@ def cummin(self): compatibility with NumPy. Returns: - scalar or Series: Return cumulative minimum of scalar or Series. + bigframes.series.Series: Return cumulative minimum of scalar or Series. """ raise NotImplementedError("abstract method") @@ -930,7 +928,7 @@ def ne(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -945,7 +943,7 @@ def le(self, other) -> Series: other: Series, or scalar value Returns: - Series. The result of the comparison. + bigframes.series.Series: The result of the comparison. """ raise NotImplementedError("abstract method") @@ -960,7 +958,7 @@ def lt(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -975,7 +973,7 @@ def ge(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation.
""" raise NotImplementedError("abstract method") @@ -990,7 +988,7 @@ def gt(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1005,7 +1003,7 @@ def add(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1020,7 +1018,7 @@ def radd(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1038,7 +1036,7 @@ def sub( other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1053,7 +1051,7 @@ def rsub(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1068,7 +1066,7 @@ def mul(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1097,7 +1095,7 @@ def truediv(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1112,7 +1110,7 @@ def rtruediv(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. 
""" raise NotImplementedError("abstract method") @@ -1127,7 +1125,7 @@ def floordiv(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1142,7 +1140,7 @@ def rfloordiv(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1157,7 +1155,7 @@ def mod(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1172,7 +1170,7 @@ def rmod(self, other) -> Series: other (Series, or scalar value): Returns: - Series: The result of the operation. + bigframes.series.Series: The result of the operation. """ raise NotImplementedError("abstract method") @@ -1373,7 +1371,7 @@ def where(self, cond, other): extension dtypes). Returns: - Series + bigframes.series.Series: Series after the replacement. """ raise NotImplementedError("abstract method") @@ -1397,7 +1395,7 @@ def mask(self, cond, other): extension dtypes). Returns: - Series + bigframes.series.Series: Series after the replacement. """ raise NotImplementedError("abstract method") @@ -1460,7 +1458,7 @@ def rename(self, index, **kwargs) -> Series | None: attribute. Returns: - Series: Series with index labels + bigframes.series.Series: Series with index labels. """ raise NotImplementedError("abstract method") @@ -1474,7 +1472,7 @@ def rename_axis(self, mapper, **kwargs): Value to set the axis name attribute. Returns: - Series: Series with the name of the axis set. + bigframes.series.Series: Series with the name of the axis set. """ raise NotImplementedError("abstract method") @@ -1514,7 +1512,8 @@ def rolling( to the size of the window. 
Returns: - ``Window`` subclass if a ``win_type`` is passed.``Rolling`` subclass if ``win_type`` is not passed + bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. + ``Rolling`` subclass if ``win_type`` is not passed. """ raise NotImplementedError("abstract method") @@ -1528,7 +1527,7 @@ def expanding(self, min_periods=1): otherwise, result is ``np.nan``. Returns: - ``Expanding`` subclass + bigframes.core.window.Window: ``Expanding`` subclass. """ raise NotImplementedError("abstract method") @@ -1591,7 +1590,7 @@ def isin(self, values): TypeError. Instead, turn a single string into a list of one element. Returns: - bigframes.series.Series: Series of booleans indicating if each element is in values. + bigframes.series.Series: Series of booleans indicating if each element is in values. Raises: TypeError: If input is not list-like. diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index a27093b552..5e3d0b047f 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -136,6 +136,102 @@ def isnumeric(self): raise NotImplementedError("abstract method") + def isalpha(self): + """Check whether all characters in each string are alphabetic. + + This is equivalent to running the Python string method + :meth:`str.isalpha` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series with the same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + + def isdigit(self): + """Check whether all characters in each string are digits. + + This is equivalent to running the Python string method + :meth:`str.isdigit` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check.
+ + Returns: + bigframes.series.Series: Series with the same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + + def isalnum(self): + """Check whether all characters in each string are alphanumeric. + + This is equivalent to running the Python string method + :meth:`str.isalnum` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + + def isspace(self): + """Check whether all characters in each string are whitespace. + + This is equivalent to running the Python string method + :meth:`str.isspace` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + + def islower(self): + """Check whether all characters in each string are lowercase. + + This is equivalent to running the Python string method + :meth:`str.islower` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + + def isupper(self): + """Check whether all characters in each string are uppercase. + + This is equivalent to running the Python string method + :meth:`str.isupper` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index.
+ """ + + raise NotImplementedError("abstract method") + + def isdecimal(self): + """Check whether all characters in each string are decimal. + + This is equivalent to running the Python string method + :meth:`str.isdecimal` for each element of the Series/Index. If a string + has zero characters, ``False`` is returned for that check. + + Returns: + bigframes.series.Series: Series or Index of boolean values with the + same length as the original Series/Index. + """ + + raise NotImplementedError("abstract method") + def rstrip(self): """Remove trailing characters. @@ -427,3 +523,47 @@ def rjust( bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ raise NotImplementedError("abstract method") + + def zfill( + self, + width: int, + ): + """ + Pad strings in the Series/Index by prepending '0' characters. + + Strings in the Series/Index are padded with '0' characters on the + left of the string to reach a total string length `width`. Strings + in the Series/Index with length greater or equal to `width` are + unchanged. + + Args: + width (int): + Minimum length of resulting string; strings with length less + than `width` will be prepended with '0' characters. + + Returns: + bigframes.series.Series: Series of objects. + """ + raise NotImplementedError("abstract method") + + def center( + self, + width: int, + fillchar: str = " ", + ): + """ + Pad left and right side of strings in the Series/Index. + + Equivalent to :meth:`str.center`. + + Args: + width (int): + Minimum width of resulting string; additional characters will be filled + with character defined in `fillchar`. + fillchar (str, default ' '): + Additional character for filling, default is whitespace. + + Returns: + bigframes.series.Series: Returns Series or Index with minimum number of char in object.
+ """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 9425ead0e3..730872034d 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -15,7 +15,56 @@ def read_gbq( col_order: Iterable[str] = (), max_results: Optional[int] = None, ): - """Loads DataFrame from BigQuery. + """Loads a DataFrame from BigQuery. + + BigQuery tables are an unordered, unindexed data source. By default, + the DataFrame will have an arbitrary index and ordering. + + Set the `index_col` argument to one or more columns to choose an + index. The resulting DataFrame is sorted by the index columns. For the + best performance, ensure the index columns don't contain duplicate + values. + + .. note:: + By default, even SQL query inputs with an ORDER BY clause create a + DataFrame with an arbitrary ordering. Use ``row_number() OVER + (ORDER BY ...) AS rowindex`` in your SQL query and set + ``index_col='rowindex'`` to preserve the desired ordering. + + If your query doesn't have an ordering, select ``GENERATE_UUID() AS + rowindex`` in your SQL and set ``index_col='rowindex'`` for the + best performance. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> bpd.options.display.progress_bar = None + + Preserve ordering in a query input. + + >>> bpd.read_gbq(''' + ... SELECT + ... -- Instead of an ORDER BY clause on the query, use + ... -- ROW_NUMBER() to create an ordered DataFrame. + ... ROW_NUMBER() OVER (ORDER BY AVG(pitchSpeed) DESC) + ... AS rowindex, + ... + ... pitcherFirstName, + ... pitcherLastName, + ... AVG(pitchSpeed) AS averagePitchSpeed + ... FROM `bigquery-public-data.baseball.games_wide` + ... WHERE year = 2016 + ... GROUP BY pitcherFirstName, pitcherLastName + ... 
''', index_col="rowindex").head(n=5) + pitcherFirstName pitcherLastName averagePitchSpeed + rowindex + 1 Albertin Chapman 96.514113 + 2 Zachary Britton 94.591039 + 3 Trevor Rosenthal 94.213953 + 4 Jose Torres 94.103448 + 5 Tayron Guerrero 93.863636 + + [5 rows x 3 columns] Args: query (str): diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py new file mode 100644 index 0000000000..a160ef0c4e --- /dev/null +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -0,0 +1,55 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/io/pickle.py +""" pickle compat """ +from __future__ import annotations + +from pandas._typing import ( + CompressionOptions, + FilePath, + ReadPickleBuffer, + StorageOptions, +) + + +class PickleIOMixin: + def read_pickle( + self, + filepath_or_buffer: FilePath | ReadPickleBuffer, + compression: CompressionOptions = "infer", + storage_options: StorageOptions = None, + ): + """Load pickled BigFrames object (or any object) from file. + + .. note:: + If the content of the pickle file is a Series and its name attribute is None, + the name will be set to '0' by default. + + Args: + filepath_or_buffer (str, path object, or file-like object): + String, path object (implementing os.PathLike[str]), or file-like object + implementing a binary readlines() function. Also accepts URL. URL is not + limited to S3 and GCS. + compression (str or dict, default 'infer'): + For on-the-fly decompression of on-disk data. If 'infer' and + 'filepath_or_buffer' is path-like, then detect compression from the following + extensions: '.gz', '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' + or '.tar.bz2' (otherwise no compression). If using 'zip' or 'tar', the ZIP + file must contain only one data file to be read in. Set to None for no + decompression. 
Can also be a dict with key 'method' set to one of {'zip', + 'gzip', 'bz2', 'zstd', 'tar'} and other key-value pairs are forwarded to + zipfile.ZipFile, gzip.GzipFile, bz2.BZ2File, zstandard.ZstdDecompressor or + tarfile.TarFile, respectively. As an example, the following could be passed + for Zstandard decompression using a custom compression dictionary + compression={'method': 'zstd', 'dict_data': my_compression_dict}. + storage_options (dict, default None): + Extra options that make sense for a particular storage connection, e.g. host, + port, username, password, etc. For HTTP(S) URLs the key-value pairs are + forwarded to urllib.request.Request as header options. For other URLs (e.g. + starting with “s3://”, and “gcs://”) the key-value pairs are forwarded to + fsspec.open. Please see fsspec and urllib for more details, and for more + examples on storage options refer here. + + Returns: + bigframes.dataframe.DataFrame or bigframes.series.Series: same type as object + stored in file. + """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index 03958f7595..fc48cde85b 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -113,7 +113,7 @@ def score(self, X, y): ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted`` is the number of samples used in the fitting for the estimator. - y (bigframes.dataframe.DataFrame or bigframes.series.Series: + y (bigframes.dataframe.DataFrame or bigframes.series.Series): Series or DataFrame of shape (n_samples,) or (n_samples, n_outputs). True values for `X`. @@ -123,6 +123,27 @@ def score(self, X, y): raise NotImplementedError("abstract method") +class TransformerMixin: + """Mixin class for all transformers.""" + + def fit_transform(self, X, y=None): + """Fit to data, then transform it. 
+ + Args: + X (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples, n_features). + Input samples. + + y (bigframes.dataframe.DataFrame or bigframes.series.Series): + Series or DataFrame of shape (n_samples,) or (n_samples, n_outputs). Default None. + Target values (None for unsupervised transformations). + + Returns: + bigframes.dataframe.DataFrame: DataFrame of shape (n_samples, n_features_new) + Transformed DataFrame. + """ + + class MetaEstimatorMixin: _required_parameters = ["estimator"] """Mixin class for all meta estimators in scikit-learn.""" diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index 068aa4d290..ea4df0dc02 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -76,8 +76,6 @@ def predict( Args: X (bigframes.dataframe.DataFrame or bigframes.series.Series): DataFrame of shape (n_samples, n_features). New data to predict. - y: (default None) - Not used, present here for API consistency by convention. Returns: bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. 
From 89b95033d6b449bfc21249057d7c024d096c80d0 Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 29 Aug 2023 11:37:20 -0500 Subject: [PATCH 2/7] chore: sync changes from internal repo (#10) feat: support `Series.corr` fix: raise AttributeError for unimplemented pandas methods feat: support `DataFrame.stack` feat: support `np.arcsin`, `np.arccos`, `np.arctan`, `np.sinh`, `np.cosh`, `np.tanh`, `np.arcsinh`, `np.arccosh`, `np.arctanh`, `np.exp` with Series argument fix: align column names with pandas in `DataFrame.agg` results docs: set `options.bigquery.project` in sample code chore: unit test internal `get_standardized_ids` method fix: include survey link in abstract `NotImplementedError` exception messages perf: lazily instantiate client library objects fix: allow (but still not recommended) `ORDER BY` in `read_gbq` input when an `index_col` is defined feat: support `read_json` with `engine=bigquery` for newline-delimited JSON files chore: remove unneeded `types-retry` reference feat: support `np.sin`, `np.cos`, `np.tan`, `np.log`, `np.log10`, `np.sqrt`, `np.abs` with Series argument fix: label temp table creation jobs with `source=bigquery-dataframes-temp` label fix: support spaces in column names in `DataFrame` initializater chore: fix permissions on publish docs script feat: support `df[my_column] = [a python list]` feat: add `components_`, `explained_variance_`, and `explained_variance_ratio_` properties to `bigframes.ml.decomposition.PCA` chore: add execute permissions on publish docs script docs: fix link to GitHub chore: fix docs build fix: check for IAM role on the BigQuery connection when initializing a `remote_function` chore: revert pin to maximum pytest-retry plugin version in tests --- .kokoro/docs/common.cfg | 2 +- .kokoro/publish-docs.sh | 0 README.rst | 9 +- bigframes/constants.py | 2 + bigframes/core/__init__.py | 102 +- bigframes/core/block_transforms.py | 32 + bigframes/core/blocks.py | 159 ++- bigframes/core/groupby/__init__.py | 23 +- 
bigframes/core/indexes/__init__.py | 3 - bigframes/core/utils.py | 53 + bigframes/dataframe.py | 167 ++- bigframes/ml/core.py | 32 +- bigframes/ml/decomposition.py | 29 + bigframes/ml/sql.py | 10 + bigframes/operations/__init__.py | 163 +++ bigframes/operations/base.py | 5 + bigframes/pandas/__init__.py | 31 +- bigframes/remote_function.py | 109 +- bigframes/series.py | 55 +- bigframes/session.py | 467 ++++-- mypy.ini | 3 + notebooks/dataframes/dataframe.ipynb | 1249 +++++++++++++++-- .../bq_dataframes_ml_linear_regression.ipynb | 6 +- .../getting_started_bq_dataframes.ipynb | 6 +- noxfile.py | 8 +- setup.py | 2 + testing/constraints-3.9.txt | 2 + tests/system/conftest.py | 77 + tests/system/small/ml/conftest.py | 43 +- tests/system/small/ml/test_core.py | 94 ++ tests/system/small/ml/test_decomposition.py | 109 +- tests/system/small/test_dataframe.py | 80 +- tests/system/small/test_dataframe_io.py | 92 +- tests/system/small/test_groupby.py | 21 +- tests/system/small/test_multiindex.py | 36 + tests/system/small/test_numpy.py | 69 + tests/system/small/test_remote_function.py | 12 + tests/system/small/test_series.py | 20 +- tests/system/small/test_session.py | 125 +- tests/unit/core/test_utils.py | 56 + tests/unit/ml/test_sql.py | 15 + .../bigframes_vendored/pandas/core/frame.py | 144 +- .../bigframes_vendored/pandas/core/generic.py | 37 +- .../pandas/core/groupby/__init__.py | 46 +- .../pandas/core/indexes/accessor.py | 23 +- .../pandas/core/indexes/base.py | 8 +- .../pandas/core/indexing.py | 6 +- .../pandas/core/reshape/concat.py | 4 +- .../pandas/core/reshape/tile.py | 4 +- .../bigframes_vendored/pandas/core/series.py | 206 +-- .../pandas/core/strings/accessor.py | 66 +- .../pandas/core/window/rolling.py | 16 +- .../bigframes_vendored/pandas/io/gbq.py | 4 +- .../bigframes_vendored/pandas/io/parquet.py | 4 +- .../pandas/io/parsers/readers.py | 82 +- .../bigframes_vendored/pandas/io/pickle.py | 4 +- .../bigframes_vendored/sklearn/base.py | 6 +- 
.../sklearn/cluster/_kmeans.py | 11 +- .../sklearn/compose/_column_transformer.py | 5 +- .../sklearn/decomposition/_pca.py | 54 +- .../sklearn/ensemble/_forest.py | 8 +- .../sklearn/linear_model/_base.py | 7 +- .../sklearn/linear_model/_logistic.py | 3 +- .../sklearn/metrics/_classification.py | 12 +- .../sklearn/metrics/_ranking.py | 8 +- .../sklearn/metrics/_regression.py | 4 +- .../bigframes_vendored/sklearn/pipeline.py | 7 +- .../sklearn/preprocessing/_data.py | 5 +- .../sklearn/preprocessing/_encoder.py | 5 +- .../bigframes_vendored/xgboost/sklearn.py | 6 +- 70 files changed, 3599 insertions(+), 774 deletions(-) mode change 100644 => 100755 .kokoro/publish-docs.sh create mode 100644 tests/system/small/test_numpy.py create mode 100644 tests/unit/core/test_utils.py diff --git a/.kokoro/docs/common.cfg b/.kokoro/docs/common.cfg index ce84d7ec49..bd73988540 100644 --- a/.kokoro/docs/common.cfg +++ b/.kokoro/docs/common.cfg @@ -20,7 +20,7 @@ env_vars: { } env_vars: { key: "TRAMPOLINE_BUILD_FILE" - value: "git/bigframes/.kokoro/publish-docs.sh" + value: ".kokoro/publish-docs.sh" } env_vars: { diff --git a/.kokoro/publish-docs.sh b/.kokoro/publish-docs.sh old mode 100644 new mode 100755 diff --git a/README.rst b/README.rst index 6ae3753eed..935c54cc8b 100644 --- a/README.rst +++ b/README.rst @@ -41,6 +41,7 @@ method accepts either a fully-qualified table ID or a SQL query. import bigframes.pandas as bpd + bpd.options.bigquery.project = your_gcp_project_id df1 = bpd.read_gbq("project.dataset.table") df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`") @@ -260,7 +261,7 @@ To view and manage Cloud Functions functions, use the `Functions `_ page and use the project picker to select the project in which you created the function. For easy identification, the names of the functions -created by BigQuery DataFrames are prefixed by ``bigframes-``. +created by BigQuery DataFrames are prefixed by ``bigframes``. 
**Requirements** @@ -283,7 +284,9 @@ following IAM roles: * BigQuery Data Editor (roles/bigquery.dataEditor) * BigQuery Connection Admin (roles/bigquery.connectionAdmin) * Cloud Functions Developer (roles/cloudfunctions.developer) -* Service Account User (roles/iam.serviceAccountUser) +* Service Account User (roles/iam.serviceAccountUser) on the + `service account ` + ``PROJECT_NUMBER-compute@developer.gserviceaccount.com`` * Storage Object Viewer (roles/storage.objectViewer) * Project IAM Admin (roles/resourcemanager.projectIamAdmin) @@ -330,7 +333,7 @@ Data processing location BigQuery DataFrames is designed for scale, which it achieves by keeping data and processing on the BigQuery service. However, you can bring data into the -memory of your client machine by calling ``.execute()`` on a DataFrame or Series +memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series object. If you choose to do this, the memory limitation of your client machine applies. diff --git a/bigframes/constants.py b/bigframes/constants.py index 3f3f155733..90837c79eb 100644 --- a/bigframes/constants.py +++ b/bigframes/constants.py @@ -21,3 +21,5 @@ "Share your usecase with the BigQuery DataFrames team at the " "https://bit.ly/bigframes-feedback%20survey." ) + +ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. 
{FEEDBACK_LINK}" diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index 9f392ce149..d6509e4c0a 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -35,6 +35,7 @@ reencode_order_string, StringEncoding, ) +import bigframes.core.utils as utils import bigframes.dtypes import bigframes.operations as ops import bigframes.operations.aggregations as agg_ops @@ -562,6 +563,36 @@ def aggregate( ordering=ordering, ) + def corr_aggregate( + self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]] + ) -> ArrayValue: + """ + Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id. + This uses BigQuery's CORR under the hood, and thus only Pearson's method is used. + Arguments: + corr_aggregations: left_column_id, right_column_id, output_column_id tuples + """ + table = self.to_ibis_expr(ordering_mode="unordered") + stats = { + col_out: table[col_left].corr(table[col_right], how="pop") + for col_left, col_right, col_out in corr_aggregations + } + aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)} + result = table.aggregate(**aggregates) + # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops(join etc.) expect it.
+ ordering = ExpressionOrdering( + ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + total_ordering_columns=frozenset([ORDER_ID_COLUMN]), + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), + ) + return ArrayValue( + self._session, + result, + columns=[result[col_id] for col_id in [*stats.keys()]], + hidden_ordering_columns=[result[ORDER_ID_COLUMN]], + ordering=ordering, + ) + def project_window_op( self, column_name: str, @@ -852,38 +883,75 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal group_by=group_by, ) - def unpivot_single_row( + def unpivot( self, row_labels: typing.Sequence[typing.Hashable], - unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]], + unpivot_columns: typing.Sequence[ + typing.Tuple[str, typing.Sequence[typing.Optional[str]]] + ], *, + passthrough_columns: typing.Sequence[str] = (), index_col_id: str = "index", - dtype=pandas.Float64Dtype(), + dtype: typing.Union[ + bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype] + ] = pandas.Float64Dtype(), ) -> ArrayValue: - """Unpivot a single row.""" - # TODO: Generalize to multiple row input - table = self.to_ibis_expr(ordering_mode="unordered") + """ + Unpivot ArrayValue columns. + + Args: + row_labels: Identifies the source of the row. Must be equal to length to source column list in unpivot_columns argument. + unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None. + passthrough_columns: Columns that will not be unpivoted. Column id will be preserved. + index_col_id (str): The column id to be used for the row labels. + dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns. 
+ + Returns: + ArrayValue: The unpivoted ArrayValue + """ + table = self.to_ibis_expr(ordering_mode="offset_col") sub_expressions = [] - # TODO: validate all columns are equal length, as well as row labels + # Use ibis memtable to infer type of rowlabels (if possible) + # TODO: Allow caller to specify dtype + labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type() + labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type) + row_n = len(row_labels) if not all( len(source_columns) == row_n for _, source_columns in unpivot_columns ): raise ValueError("Columns and row labels must all be same length.") - # Select each column for i in range(row_n): values = [] - for result_col, source_cols in unpivot_columns: - values.append( - ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col) - ) - + for j in range(len(unpivot_columns)): + result_col, source_cols = unpivot_columns[j] + col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype + if source_cols[i] is not None: + values.append( + ops.AsTypeOp(col_dtype) + ._as_ibis(table[source_cols[i]]) + .name(result_col) + ) + else: + values.append( + bigframes.dtypes.literal_to_ibis_scalar( + None, force_dtype=col_dtype + ).name(result_col) + ) + offsets_value = ( + ((table[ORDER_ID_COLUMN] * row_n) + i) + .cast(ibis_dtypes.int64) + .name(ORDER_ID_COLUMN), + ) sub_expr = table.select( - ibis_types.literal(row_labels[i]).name(index_col_id), + passthrough_columns, + bigframes.dtypes.literal_to_ibis_scalar( + row_labels[i], force_dtype=labels_dtype # type:ignore + ).name(index_col_id), *values, - ibis_types.literal(i).name(ORDER_ID_COLUMN), + offsets_value, ) sub_expressions.append(sub_expr) rotated_table = ibis.union(*sub_expressions) @@ -891,13 +959,15 @@ def unpivot_single_row( value_columns = [ rotated_table[value_col_id] for value_col_id, _ in unpivot_columns ] + passthrough_values = [rotated_table[col] for col in passthrough_columns] return ArrayValue( session=self._session, 
table=rotated_table, - columns=[rotated_table[index_col_id], *value_columns], + columns=[rotated_table[index_col_id], *value_columns, *passthrough_values], hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]], ordering=ExpressionOrdering( ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)], + integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True), total_ordering_columns=frozenset([ORDER_ID_COLUMN]), ), ) diff --git a/bigframes/core/block_transforms.py b/bigframes/core/block_transforms.py index b13d7bf2d3..abf8b887d8 100644 --- a/bigframes/core/block_transforms.py +++ b/bigframes/core/block_transforms.py @@ -197,3 +197,35 @@ def rank( ) return block.select_columns(rownum_col_ids).with_column_labels(labels) + + +def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"): + """ + Drop na entries from block + """ + if how == "any": + filtered_block = block + for column in block.value_columns: + filtered_block, result_id = filtered_block.apply_unary_op( + column, ops.notnull_op + ) + filtered_block = filtered_block.filter(result_id) + filtered_block = filtered_block.drop_columns([result_id]) + return filtered_block + else: # "all" + filtered_block = block + predicate = None + for column in block.value_columns: + filtered_block, partial_predicate = filtered_block.apply_unary_op( + column, ops.notnull_op + ) + if predicate: + filtered_block, predicate = filtered_block.apply_binary_op( + partial_predicate, predicate, ops.or_op + ) + else: + predicate = partial_predicate + if predicate: + filtered_block = filtered_block.filter(predicate) + filtered_block = filtered_block.select_columns(block.value_columns) + return filtered_block diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index 2731990feb..f23a4d0b5c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -80,15 +80,18 @@ def __init__( self, expr: core.ArrayValue, index_columns: Iterable[str], - column_labels: typing.Union[pd.Index, 
typing.Sequence[Label]], - index_labels: typing.Union[pd.Index, typing.Sequence[Label], None] = None, + column_labels: typing.Union[pd.Index, typing.Iterable[Label]], + index_labels: typing.Union[pd.Index, typing.Iterable[Label], None] = None, ): """Construct a block object, will create default index if no index columns specified.""" - if index_labels and (len(index_labels) != len(list(index_columns))): - raise ValueError( - "'index_columns' and 'index_labels' must have equal length" - ) - if len(list(index_columns)) == 0: + index_columns = list(index_columns) + if index_labels: + index_labels = list(index_labels) + if len(index_labels) != len(index_columns): + raise ValueError( + "'index_columns' and 'index_labels' must have equal length" + ) + if len(index_columns) == 0: expr, new_index_col_id = expr.promote_offsets() index_columns = [new_index_col_id] self._index_columns = tuple(index_columns) @@ -114,6 +117,7 @@ def __init__( self._stats_cache: dict[str, dict[str, typing.Any]] = { col_id: {} for col_id in self.value_columns } + # TODO(kemppeterson) Add a cache for corr to parallel the single-column stats. @property def index(self) -> indexes.IndexValue: @@ -826,9 +830,7 @@ def aggregate_all_and_pivot( dtype=pd.Float64Dtype(), ) -> Block: aggregations = [(col_id, operation, col_id) for col_id in self.value_columns] - result_expr = self.expr.aggregate( - aggregations, dropna=dropna - ).unpivot_single_row( + result_expr = self.expr.aggregate(aggregations, dropna=dropna).unpivot( row_labels=self.column_labels.to_list(), index_col_id="index", unpivot_columns=[(value_col_id, self.value_columns)], @@ -966,6 +968,26 @@ def get_stat(self, column_id: str, stat: agg_ops.AggregateOp): self._stats_cache[column_id].update(stats_map) return stats_map[stat.name] + def get_corr_stat(self, column_id_left: str, column_id_right: str): + # TODO(kemppeterson): Clean up the column names for DataFrames.corr support + # TODO(kemppeterson): Add a cache here. 
+ corr_aggregations = [ + ( + column_id_left, + column_id_right, + "corr_" + column_id_left + column_id_right, + ) + ] + expr = self.expr.corr_aggregate(corr_aggregations) + expr, offset_index_id = expr.promote_offsets() + block = Block( + expr, + index_columns=[offset_index_id], + column_labels=[a[2] for a in corr_aggregations], + ) + df, _ = block.to_pandas() + return df.loc[0, "corr_" + column_id_left + column_id_right] + def summarize( self, column_ids: typing.Sequence[str], @@ -983,7 +1005,7 @@ def summarize( (col_id, [f"{col_id}-{stat.name}" for stat in stats]) for col_id in column_ids ] - expr = self.expr.aggregate(aggregations).unpivot_single_row( + expr = self.expr.aggregate(aggregations).unpivot( labels, unpivot_columns=columns, index_col_id=label_col_id, @@ -1166,6 +1188,121 @@ def pivot( return result_block.with_column_labels(column_index) + def stack(self): + """Unpivot last column axis level into row axis""" + if isinstance(self.column_labels, pd.MultiIndex): + return self._stack_multi() + else: + return self._stack_mono() + + def _stack_mono(self): + if isinstance(self.column_labels, pd.MultiIndex): + raise ValueError("Expected single level index") + + # These are the values that will be turned into rows + stack_values = self.column_labels.drop_duplicates().sort_values() + + # Get matching columns + unpivot_columns: List[Tuple[str, List[str]]] = [] + dtypes: List[bigframes.dtypes.Dtype] = [] + col_id = guid.generate_guid("unpivot_") + dtype = None + input_columns: Sequence[Optional[str]] = [] + for uvalue in stack_values: + matching_ids = self.label_to_col_id.get(uvalue, []) + input_id = matching_ids[0] if len(matching_ids) > 0 else None + if input_id: + if dtype and dtype != self._column_type(input_id): + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." 
+ ) + else: + dtype = self._column_type(input_id) + input_columns.append(input_id) + unpivot_columns.append((col_id, input_columns)) + if dtype: + dtypes.append(dtype or pd.Float64Dtype()) + + added_index_column = col_id = guid.generate_guid() + unpivot_expr = self._expr.unpivot( + row_labels=stack_values, + passthrough_columns=self.index_columns, + unpivot_columns=unpivot_columns, + index_col_id=added_index_column, + dtype=dtypes, + ) + block = Block( + unpivot_expr, + index_columns=[*self.index_columns, added_index_column], + column_labels=[None], + index_labels=[*self._index_labels, self.column_labels.names[-1]], + ) + return block + + def _stack_multi(self): + if not isinstance(self.column_labels, pd.MultiIndex): + raise ValueError("Expected multi-index") + + # These are the values that will be turned into rows + stack_values = ( + self.column_labels.get_level_values(-1).drop_duplicates().sort_values() + ) + + result_col_labels = ( + self.column_labels.droplevel(-1) + .drop_duplicates() + .sort_values() + .dropna(how="all") + ) + + # Get matching columns + unpivot_columns: List[Tuple[str, List[str]]] = [] + dtypes = [] + for val in result_col_labels: + col_id = guid.generate_guid("unpivot_") + dtype = None + input_columns: Sequence[Optional[str]] = [] + for uvalue in stack_values: + # Need to unpack if still a multi-index after dropping 1 level + label_to_match = ( + (val, uvalue) if result_col_labels.nlevels == 1 else (*val, uvalue) + ) + matching_ids = self.label_to_col_id.get(label_to_match, []) + input_id = matching_ids[0] if len(matching_ids) > 0 else None + if input_id: + if dtype and dtype != self._column_type(input_id): + raise NotImplementedError( + "Cannot stack columns with non-matching dtypes." 
+ ) + else: + dtype = self._column_type(input_id) + input_columns.append(input_id) + # Input column i is the first one that + unpivot_columns.append((col_id, input_columns)) + if dtype: + dtypes.append(dtype or pd.Float64Dtype()) + + added_index_column = col_id = guid.generate_guid() + unpivot_expr = self._expr.unpivot( + row_labels=stack_values, + passthrough_columns=self.index_columns, + unpivot_columns=unpivot_columns, + index_col_id=added_index_column, + dtype=dtypes, + ) + block = Block( + unpivot_expr, + index_columns=[*self.index_columns, added_index_column], + column_labels=result_col_labels, + index_labels=[*self._index_labels, self.column_labels.names[-1]], + ) + return block + + def _column_type(self, col_id: str) -> bigframes.dtypes.Dtype: + col_offset = self.value_columns.index(col_id) + dtype = self.dtypes[col_offset] + return dtype + @staticmethod def _create_pivot_column_index( value_labels: Sequence[typing.Hashable], columns_values: pd.Index diff --git a/bigframes/core/groupby/__init__.py b/bigframes/core/groupby/__init__.py index 5b217effdd..589c5c251c 100644 --- a/bigframes/core/groupby/__init__.py +++ b/bigframes/core/groupby/__init__.py @@ -197,8 +197,11 @@ def _agg_string(self, func: str) -> df.DataFrame: return df.DataFrame(agg_block) def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: - aggregations = [] + aggregations: typing.List[typing.Tuple[str, agg_ops.AggregateOp]] = [] column_labels = [] + + want_aggfunc_level = any(utils.is_list_like(aggs) for aggs in func.values()) + for label, funcs_for_id in func.items(): col_id = self._resolve_label(label) func_list = ( @@ -206,16 +209,22 @@ def _agg_dict(self, func: typing.Mapping) -> df.DataFrame: ) for f in func_list: aggregations.append((col_id, agg_ops.lookup_agg_func(f))) - column_labels.append((col_id, f)) + column_labels.append(label) agg_block, _ = self._block.aggregate( by_column_ids=self._by_col_ids, aggregations=aggregations, as_index=self._as_index, dropna=self._dropna, ) - 
agg_block = agg_block.with_column_labels( - pd.MultiIndex.from_tuples(column_labels) - ) + if want_aggfunc_level: + agg_block = agg_block.with_column_labels( + utils.combine_indices( + pd.Index(column_labels), + pd.Index(agg[1].name for agg in aggregations), + ) + ) + else: + agg_block = agg_block.with_column_labels(pd.Index(column_labels)) return df.DataFrame(agg_block) def _agg_list(self, func: typing.Sequence) -> df.DataFrame: @@ -234,7 +243,9 @@ def _agg_list(self, func: typing.Sequence) -> df.DataFrame: dropna=self._dropna, ) agg_block = agg_block.with_column_labels( - pd.MultiIndex.from_tuples(column_labels) + pd.MultiIndex.from_tuples( + column_labels, names=[*self._block.column_labels.names, None] + ) ) return df.DataFrame(agg_block) diff --git a/bigframes/core/indexes/__init__.py b/bigframes/core/indexes/__init__.py index d797c57955..184a9ce262 100644 --- a/bigframes/core/indexes/__init__.py +++ b/bigframes/core/indexes/__init__.py @@ -14,10 +14,7 @@ from bigframes.core.indexes.index import Index, IndexValue -INDEX_COLUMN_ID = "bigframes_index_{}" - __all__ = [ "Index", "IndexValue", - "INDEX_COLUMN_ID", ] diff --git a/bigframes/core/utils.py b/bigframes/core/utils.py index a330002905..1c0a2a1a81 100644 --- a/bigframes/core/utils.py +++ b/bigframes/core/utils.py @@ -12,10 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import typing +from typing import Hashable, Iterable, List import pandas as pd import typing_extensions +import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common + +UNNAMED_COLUMN_ID = "bigframes_unnamed_column" +UNNAMED_INDEX_ID = "bigframes_unnamed_index" + def get_axis_number(axis: typing.Union[str, int, None]) -> typing.Literal[0, 1]: if axis in {0, "index", "rows", None}: @@ -31,3 +37,50 @@ def is_list_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Sequence def is_dict_like(obj: typing.Any) -> typing_extensions.TypeGuard[typing.Mapping]: return pd.api.types.is_dict_like(obj) + + +def combine_indices(index1: pd.Index, index2: pd.Index) -> pd.MultiIndex: + """Combines indices into multi-index while preserving dtypes, names.""" + multi_index = pd.MultiIndex.from_frame( + pd.concat([index1.to_frame(index=False), index2.to_frame(index=False)], axis=1) + ) + # to_frame will produce numbered default names, we don't want these + multi_index.names = [*index1.names, *index2.names] + return multi_index + + +def get_standardized_ids( + col_labels: Iterable[Hashable], idx_labels: Iterable[Hashable] = () +) -> tuple[list[str], list[str]]: + """Get stardardized column ids as column_ids_list, index_ids_list. + The standardized_column_id must be valid BQ SQL schema column names, can only be string type and unique. + + Args: + col_labels: column labels + + idx_labels: index labels, optional. If empty, will only return column ids. + + Return: + Tuple of (standardized_column_ids, standardized_index_ids) + """ + col_ids = [ + UNNAMED_COLUMN_ID if col_label is None else str(col_label) + for col_label in col_labels + ] + idx_ids = [ + UNNAMED_INDEX_ID if idx_label is None else str(idx_label) + for idx_label in idx_labels + ] + + ids = idx_ids + col_ids + # Column values will be loaded as null if the column name has spaces. 
+ # https://github.com/googleapis/python-bigquery/issues/1566 + ids = [id.replace(" ", "_") for id in ids] + + ids = typing.cast( + List[str], + vendored_pandas_io_common.dedup_names(ids, is_potential_multiindex=False), + ) + idx_ids, col_ids = ids[: len(idx_ids)], ids[len(idx_ids) :] + + return col_ids, idx_ids diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index 5fbe5d1f9e..ef443db079 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -56,7 +56,6 @@ import bigframes.series import bigframes.series as bf_series import third_party.bigframes_vendored.pandas.core.frame as vendored_pandas_frame -import third_party.bigframes_vendored.pandas.io.common as vendored_pandas_io_common import third_party.bigframes_vendored.pandas.pandas._typing as vendored_pandas_typing if typing.TYPE_CHECKING: @@ -291,84 +290,59 @@ def astype( self, dtype: Union[bigframes.dtypes.DtypeString, bigframes.dtypes.Dtype], ) -> DataFrame: - return self._apply_to_rows(ops.AsTypeOp(dtype)) + return self._apply_unary_op(ops.AsTypeOp(dtype)) def _to_sql_query( - self, always_include_index: bool - ) -> Tuple[str, List[Tuple[str, bool]]]: + self, include_index: bool + ) -> Tuple[str, list[str], list[blocks.Label]]: """Compiles this DataFrame's expression tree to SQL, optionally - including unnamed index columns. + including index columns. Args: - always_include_index (bool): - whether to include unnamed index columns. If False, only named - indexes are included. + include_index (bool): + whether to include index columns. - Returns: a tuple of (sql_string, index_column_list) - Each entry in the index column list is a tuple of (column_name, named). - If named is false, then the column name exists only in SQL + Returns: + a tuple of (sql_string, index_column_id_list, index_column_label_list). + If include_index is set to False, index_column_id_list and index_column_label_list + return empty lists. 
""" # Has to be unordered as it is impossible to order the sql without # including metadata columns in selection with ibis. ibis_expr = self._block.expr.to_ibis_expr(ordering_mode="unordered") - column_labels = list(self._block.column_labels) + col_labels, idx_labels = list(self._block.column_labels), list( + self._block.index_labels + ) + old_col_ids, old_idx_ids = list(self._block.value_columns), list( + self._block.index_columns + ) - # TODO(swast): Need to have a better way of controlling when to include - # the index or not. - index_has_names = all([name is not None for name in self.index.names]) - if index_has_names: - column_labels = column_labels + list(self.index.names) - elif always_include_index: - # In this mode include the index even if it is a nameless generated - # column like 'bigframes_index_0' - index_labels = [] - unnamed_index_count = 0 - for index_label in self._block.index_labels: - if isinstance(index_label, str): - index_labels.append(index_label) - else: - index_labels.append( - indexes.INDEX_COLUMN_ID.format(unnamed_index_count), - ) - unnamed_index_count += 1 + if not include_index: + idx_labels, old_idx_ids = [], [] + ibis_expr = ibis_expr.drop(*self._block.index_columns) - column_labels = column_labels + typing.cast( - List[Optional[str]], index_labels - ) + old_ids = old_idx_ids + old_col_ids + + new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + new_ids = new_idx_ids + new_col_ids - column_labels_deduped = typing.cast( - List[str], - vendored_pandas_io_common.dedup_names( - column_labels, is_potential_multiindex=False - ), - ) - column_ids = self._block.value_columns substitutions = {} - for column_id, column_label in zip(column_ids, column_labels_deduped): + for old_id, new_id in zip(old_ids, new_ids): # TODO(swast): Do we need to further escape this, or can we rely on # the BigQuery unicode column name feature? 
- substitutions[column_id] = column_label - - index_cols: List[Tuple[str, bool]] = [] - first_index_offset = len(self._block.column_labels) - if index_has_names or always_include_index: - for i, index_col in enumerate(self._block.index_columns): - offset = first_index_offset + i - substitutions[index_col] = column_labels_deduped[offset] - index_cols = [ - (label, index_has_names) - for label in column_labels_deduped[first_index_offset:] - ] - else: - ibis_expr = ibis_expr.drop(*self._block.index_columns) + substitutions[old_id] = new_id ibis_expr = ibis_expr.relabel(substitutions) - return typing.cast(str, ibis_expr.compile()), index_cols + return ( + typing.cast(str, ibis_expr.compile()), + new_ids[: len(idx_labels)], + idx_labels, + ) @property def sql(self) -> str: """Compiles this DataFrame's expression tree to SQL.""" - sql, _ = self._to_sql_query(always_include_index=False) + sql, _, _ = self._to_sql_query(include_index=False) return sql @property @@ -469,12 +443,12 @@ def __getattr__(self, key: str): if key in self._block.column_labels: return self.__getitem__(key) elif hasattr(pandas.DataFrame, key): - raise NotImplementedError( + raise AttributeError( textwrap.dedent( f""" - BigQuery DataFrames has not yet implemented an equivalent to - 'pandas.DataFrame.{key}'. {constants.FEEDBACK_LINK} - """ + BigQuery DataFrames has not yet implemented an equivalent to + 'pandas.DataFrame.{key}'. 
{constants.FEEDBACK_LINK} + """ ) ) else: @@ -872,6 +846,32 @@ def _assign_single_item( copy = self.copy() copy[k] = v(copy) return copy + elif utils.is_list_like(v): + given_rows = len(v) + actual_rows = len(self) + if given_rows != actual_rows: + raise ValueError( + f"Length of values ({given_rows}) does not match length of index ({actual_rows})" + ) + + local_df = bigframes.dataframe.DataFrame( + {k: v}, session=self._get_block().expr._session + ) + # local_df is likely (but not guarunteed) to be cached locally + # since the original list came from memory and so is probably < MAX_INLINE_DF_SIZE + + this_expr, this_offsets_col_id = self._get_block()._expr.promote_offsets() + block = blocks.Block( + expr=this_expr, + index_labels=self.index.names, + index_columns=self._block.index_columns, + column_labels=[this_offsets_col_id] + list(self._block.value_columns), + ) # offsets are temporarily the first value column, label set to id + this_df_with_offsets = DataFrame(data=block) + join_result = this_df_with_offsets.join( + other=local_df, on=this_offsets_col_id, how="left" + ) + return join_result.drop(columns=[this_offsets_col_id]) else: return self._assign_scalar(k, v) @@ -1024,13 +1024,7 @@ def add_suffix(self, suffix: str, axis: int | str | None = None) -> DataFrame: return DataFrame(self._get_block().add_suffix(suffix, axis)) def dropna(self) -> DataFrame: - block = self._block - for column in self._block.value_columns: - block, result_id = block.apply_unary_op(column, ops.notnull_op) - block = block.filter(result_id) - block = block.drop_columns([result_id]) - - return DataFrame(block) + return DataFrame(block_ops.dropna(self._block, how="any")) def any( self, @@ -1209,6 +1203,14 @@ def pivot( ) return DataFrame(pivot_block) + def stack(self): + # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack. 
+ # TODO: support 'dropna' param by executing dropna only conditionally + result_block = block_ops.dropna(self._block.stack(), how="all") + if not isinstance(self.columns, pandas.MultiIndex): + return bigframes.series.Series(result_block) + return DataFrame(result_block) + def _drop_non_numeric(self, keep_bool=True) -> DataFrame: types_to_keep = set(bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES) if not keep_bool: @@ -1508,15 +1510,15 @@ def _groupby_series( ) def abs(self) -> DataFrame: - return self._apply_to_rows(ops.abs_op) + return self._apply_unary_op(ops.abs_op) def isna(self) -> DataFrame: - return self._apply_to_rows(ops.isnull_op) + return self._apply_unary_op(ops.isnull_op) isnull = isna def notna(self) -> DataFrame: - return self._apply_to_rows(ops.notnull_op) + return self._apply_unary_op(ops.notnull_op) notnull = notna @@ -1736,7 +1738,7 @@ def to_parquet(self, path: str, *, index: bool = True) -> None: _, query_job = self._block.expr._session._start_query(export_data_statement) self._set_internal_query_job(query_job) - def _apply_to_rows(self, operation: ops.UnaryOp): + def _apply_unary_op(self, operation: ops.UnaryOp) -> DataFrame: block = self._block.multi_apply_unary_op(self._block.value_columns, operation) return DataFrame(block) @@ -1813,7 +1815,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: # to be applied before passing data to remote function, protecting from bad # inputs causing errors. reprojected_df = DataFrame(self._block._force_reproject()) - return reprojected_df._apply_to_rows( + return reprojected_df._apply_unary_op( ops.RemoteFunctionOp(func, apply_on_null=(na_action is None)) ) @@ -1871,6 +1873,25 @@ def _slice( block = self._block.slice(start=start, stop=stop, step=step) return DataFrame(block) + def __array_ufunc__( + self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs + ) -> DataFrame: + """Used to support numpy ufuncs. 
+ See: https://numpy.org/doc/stable/reference/ufuncs.html + """ + if ( + inputs[0] is not self + or method != "__call__" + or len(inputs) > 1 + or len(kwargs) > 0 + ): + return NotImplemented + + if ufunc in ops.NUMPY_TO_OP: + return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + + return NotImplemented + def _set_block(self, block: blocks.Block): self._block = block diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 812bb08dc3..27727c9f81 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -74,20 +74,18 @@ def _apply_sql( string from which to construct the output dataframe. It must include the index columns of the input SQL. """ - source_sql, tagged_index_cols = input_data._to_sql_query( - always_include_index=True + source_sql, index_col_ids, index_labels = input_data._to_sql_query( + include_index=True ) - if len(tagged_index_cols) != 1: + if len(index_col_ids) != 1: raise NotImplementedError( f"Only exactly one index column is supported. {constants.FEEDBACK_LINK}" ) - index_col_name, is_named_index = tagged_index_cols[0] sql = func(source_sql) - df = session.read_gbq(sql, index_col=[index_col_name]) - if not is_named_index: - df.index.name = None + df = session.read_gbq(sql, index_col=index_col_ids) + df.index.names = index_labels return df @@ -150,10 +148,10 @@ def forecast(self) -> bpd.DataFrame: def evaluate(self, input_data: Optional[bpd.DataFrame] = None): # TODO: validate input data schema # Note: don't need index as evaluate returns a new table - source_sql, _ = ( - input_data._to_sql_query(always_include_index=False) + source_sql, _, _ = ( + input_data._to_sql_query(include_index=False) if (input_data is not None) - else (None, None) + else (None, None, None) ) sql = ml_sql.ml_evaluate(self.model_name, source_sql) @@ -166,6 +164,20 @@ def centroids(self): return self._session.read_gbq(sql) + def principal_components(self): + assert self._model.model_type == "PCA" + + sql = ml_sql.ml_principal_components(self.model_name) + + 
return self._session.read_gbq(sql) + + def principal_component_info(self): + assert self._model.model_type == "PCA" + + sql = ml_sql.ml_principal_component_info(self.model_name) + + return self._session.read_gbq(sql) + def copy(self, new_model_name: str, replace: bool = False) -> BqmlModel: job_config = bigquery.job.CopyJobConfig() if replace: diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 76b4f9ced6..16106d3a7b 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -70,6 +70,35 @@ def fit( ) return self + @property + def components_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError("A model must be fitted before calling components_.") + + return self._bqml_model.principal_components() + + @property + def explained_variance_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError( + "A model must be fitted before calling explained_variance_." + ) + + return self._bqml_model.principal_component_info()[ + ["principal_component_id", "eigenvalue"] + ].rename(columns={"eigenvalue": "explained_variance"}) + + @property + def explained_variance_ratio_(self) -> bpd.DataFrame: + if not self._bqml_model: + raise RuntimeError( + "A model must be fitted before calling explained_variance_ratio_." 
+ ) + + return self._bqml_model.principal_component_info()[ + ["principal_component_id", "explained_variance_ratio"] + ] + def predict(self, X: Union[bpd.DataFrame, bpd.Series]) -> bpd.DataFrame: if not self._bqml_model: raise RuntimeError("A model must be fitted before predict") diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index 80054d40e1..bcd8243582 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -182,3 +182,13 @@ def ml_generate_text_embedding( def ml_forecast(model_name: str) -> str: """Encode ML.FORECAST for BQML""" return f"""SELECT * FROM ML.FORECAST(MODEL `{model_name}`)""" + + +def ml_principal_components(model_name: str) -> str: + """Encode ML.PRINCIPAL_COMPONENTS for BQML""" + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `{model_name}`)""" + + +def ml_principal_component_info(model_name: str) -> str: + """Encode ML.PRINCIPAL_COMPONENT_INFO for BQML""" + return f"""SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `{model_name}`)""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 58f19ea8e7..9305cf1dda 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -32,6 +32,12 @@ _ZERO = typing.cast(ibis_types.NumericValue, ibis_types.literal(0)) _NAN = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.nan)) _INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(np.inf)) +_NEG_INF = typing.cast(ibis_types.NumericValue, ibis_types.literal(-np.inf)) + +# Approx Highest number you can pass in to EXP function and get a valid FLOAT64 result +# FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) +# ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. 
+_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value] TernaryOp = typing.Callable[ @@ -51,11 +57,142 @@ def is_windowed(self): return False +# Trig Functions class AbsOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).abs() +class SinOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).sin() + + +class CosOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).cos() + + +class TanOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).tan() + + +# Inverse trig functions +class ArcsinOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.asin()) + + +class ArccosOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() <= _ibis_num(1) + return (~domain).ifelse(_NAN, numeric_value.acos()) + + +class ArctanOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + return typing.cast(ibis_types.NumericValue, x).atan() + + +# Hyperbolic trig functions +# BQ has these functions, but Ibis doesn't +class SinhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sinh_result = ( + numeric_value.exp() - (numeric_value.negate()).exp() + ) / _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF * numeric_value.sign(), sinh_result) + + +class CoshOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + cosh_result = ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) 
/ _ibis_num(2) + domain = numeric_value.abs() < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, cosh_result) + + +class TanhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + tanh_result = (numeric_value.exp() - (numeric_value.negate()).exp()) / ( + numeric_value.exp() + (numeric_value.negate()).exp() + ) + # Beyond +-20, is effectively just the sign function + domain = numeric_value.abs() < _ibis_num(20) + return (~domain).ifelse(numeric_value.sign(), tanh_result) + + +class ArcsinhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) + _ibis_num(1)).sqrt() + return (numeric_value.abs() + sqrt_part).ln() * numeric_value.sign() + + +class ArccoshOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + sqrt_part = ((numeric_value * numeric_value) - _ibis_num(1)).sqrt() + acosh_result = (numeric_value + sqrt_part).ln() + domain = numeric_value >= _ibis_num(1) + return (~domain).ifelse(_NAN, acosh_result) + + +class ArctanhOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value.abs() < _ibis_num(1) + numerator = numeric_value + _ibis_num(1) + denominator = _ibis_num(1) - numeric_value + ln_input = typing.cast(ibis_types.NumericValue, numerator.div(denominator)) + atanh_result = ln_input.ln().div(2) + + out_of_domain = (numeric_value.abs() == _ibis_num(1)).ifelse( + _INF * numeric_value, _NAN + ) + + return (~domain).ifelse(out_of_domain, atanh_result) + + +class SqrtOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value >= _ZERO + return (~domain).ifelse(_NAN, numeric_value.sqrt()) + + +class Log10Op(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = 
typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.log10()) + + +class LnOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value > _ZERO + out_of_domain = (numeric_value == _ZERO).ifelse(_NEG_INF, _NAN) + return (~domain).ifelse(out_of_domain, numeric_value.ln()) + + +class ExpOp(UnaryOp): + def _as_ibis(self, x: ibis_types.Value): + numeric_value = typing.cast(ibis_types.NumericValue, x) + domain = numeric_value < _FLOAT64_EXP_BOUND + return (~domain).ifelse(_INF, numeric_value.exp()) + + class InvertOp(UnaryOp): def _as_ibis(self, x: ibis_types.Value): return typing.cast(ibis_types.NumericValue, x).negate() @@ -484,6 +621,28 @@ def _as_ibis(self, x: ibis_types.Value): year_op = YearOp() capitalize_op = CapitalizeOp() +# Just parameterless unary ops for now +# TODO: Parameter mappings +NUMPY_TO_OP: typing.Final = { + np.sin: SinOp(), + np.cos: CosOp(), + np.tan: TanOp(), + np.arcsin: ArcsinOp(), + np.arccos: ArccosOp(), + np.arctan: ArctanOp(), + np.sinh: SinhOp(), + np.cosh: CoshOp(), + np.tanh: TanhOp(), + np.arcsinh: ArcsinhOp(), + np.arccosh: ArccoshOp(), + np.arctanh: ArctanhOp(), + np.exp: ExpOp(), + np.log: LnOp(), + np.log10: Log10Op(), + np.sqrt: SqrtOp(), + np.abs: AbsOp(), +} + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): @@ -785,3 +944,7 @@ def partial_arg3(op: TernaryOp, scalar: typing.Any) -> BinaryOp: def is_null(value) -> bool: # float NaN/inf should be treated as distinct from 'true' null values return typing.cast(bool, pd.isna(value)) and not isinstance(value, float) + + +def _ibis_num(number: float): + return typing.cast(ibis_types.NumericValue, ibis_types.literal(number)) diff --git a/bigframes/operations/base.py b/bigframes/operations/base.py index 
361fdca055..81a5bc4c41 100644 --- a/bigframes/operations/base.py +++ b/bigframes/operations/base.py @@ -167,6 +167,11 @@ def _apply_binary_op( partial_op = ops.BinopPartialRight(op, other) return self._apply_unary_op(partial_op) + def _apply_corr_aggregation(self, other: series.Series) -> float: + (left, right, block) = self._align(other, how="outer") + + return block.get_corr_stat(left, right) + def _align(self, other: series.Series, how="outer") -> tuple[str, str, blocks.Block]: # type: ignore """Aligns the series value with another scalar or series object. Returns new left column id, right column id and joined tabled expression.""" values, block = self._align_n( diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index b688c18723..280fce1112 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -148,13 +148,15 @@ def _set_default_session_location_if_possible(query): ): return - bqclient, _, _, _ = bigframes.session._create_cloud_clients( + clients_provider = bigframes.session.ClientsProvider( project=options.bigquery.project, location=options.bigquery.location, use_regional_endpoints=options.bigquery.use_regional_endpoints, credentials=options.bigquery.credentials, ) + bqclient = clients_provider.bqclient + if bigframes.session._is_query(query): job = bqclient.query(query, bigquery.QueryJobConfig(dry_run=True)) options.bigquery.location = job.location @@ -226,6 +228,33 @@ def read_csv( read_csv.__doc__ = inspect.getdoc(bigframes.session.Session.read_csv) +def read_json( + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, +) -> bigframes.dataframe.DataFrame: + return global_session.with_default_session( + bigframes.session.Session.read_json, + path_or_buf=path_or_buf, + 
orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + engine=engine, + **kwargs, + ) + + +read_json.__doc__ = inspect.getdoc(bigframes.session.Session.read_json) + + def read_gbq( query: str, *, diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 7cf74d6311..2a4b919dab 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -34,7 +34,14 @@ import cloudpickle import google.api_core.exceptions -from google.cloud import bigquery, bigquery_connection_v1, functions_v2 +import google.api_core.retry +from google.cloud import ( + bigquery, + bigquery_connection_v1, + functions_v2, + resourcemanager_v3, +) +import google.iam.v1 from ibis.backends.bigquery.compiler import compiles from ibis.backends.bigquery.datatypes import BigQueryType from ibis.expr.datatypes.core import DataType as IbisDataType @@ -152,6 +159,7 @@ def __init__( bq_client, bq_connection_client, bq_connection_id, + cloud_resource_manager_client, ): self._gcp_project_id = gcp_project_id self._cloud_function_region = cloud_function_region @@ -161,6 +169,7 @@ def __init__( self._bq_client = bq_client self._bq_connection_client = bq_connection_client self._bq_connection_id = bq_connection_id + self._cloud_resource_manager_client = cloud_resource_manager_client def create_bq_remote_function( self, input_args, input_types, output_type, endpoint, bq_function_name @@ -175,7 +184,8 @@ def create_bq_remote_function( # raise ValueError("Failed to enable BigQuery Connection API") # If the intended connection does not exist then create it - if self.check_bq_connection_exists(): + service_account_id = self.get_service_account_if_connection_exists() + if service_account_id: logger.info(f"Connector {self._bq_connection_id} already exists") else: connection_name, service_account_id = self.create_bq_connection() @@ -183,21 +193,9 @@ def create_bq_remote_function( f"Created BQ connection {connection_name} with service account id: {service_account_id}" ) - # Set 
up access on the newly created BQ connection - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - # We would explicitly wait for 60+ seconds for the IAM binding to take effect - command_iam = ( - f"gcloud projects add-iam-policy-binding {self._gcp_project_id}" - + f' --member="serviceAccount:{service_account_id}"' - + ' --role="roles/run.invoker"' - ) - logger.info(f"Setting up IAM binding on the BQ connection: {command_iam}") - _run_system_command(command_iam) - - logger.info( - f"Waiting {self._iam_wait_seconds} seconds for IAM to take effect.." - ) - time.sleep(self._iam_wait_seconds) + # Ensure IAM role on the BQ connection + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + self._ensure_iam_binding(service_account_id, "run.invoker") # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -239,6 +237,53 @@ def get_cloud_function_endpoint(self, name): pass return None + # Introduce retries to accommodate transient errors like etag mismatch, + # which can be caused by concurrent operation on the same resource, and + # manifests with message like: + # google.api_core.exceptions.Aborted: 409 There were concurrent policy + # changes. Please retry the whole read-modify-write with exponential + # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match + # the current policy's ETag '\007\006\003,\3750&\363'. 
+ @google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type( + google.api_core.exceptions.Aborted + ), + initial=10, + maximum=20, + multiplier=2, + timeout=60, + ) + def _ensure_iam_binding(self, service_account: str, role: str): + """Ensure necessary IAM role is configured on a service account.""" + project = f"projects/{self._gcp_project_id}" + service_account = f"serviceAccount:{service_account}" + role = f"roles/{role}" + request = google.iam.v1.iam_policy_pb2.GetIamPolicyRequest(resource=project) + policy = self._cloud_resource_manager_client.get_iam_policy(request=request) + + # Check if the binding already exists, and if does, do nothing more + for binding in policy.bindings: + if binding.role == role: + if service_account in binding.members: + return + + # Create a new binding + new_binding = google.iam.v1.policy_pb2.Binding( + role=role, members=[service_account] + ) + policy.bindings.append(new_binding) + request = google.iam.v1.iam_policy_pb2.SetIamPolicyRequest( + resource=project, policy=policy + ) + self._cloud_resource_manager_client.set_iam_policy(request=request) + + # We would wait for the IAM policy change to take effect + # https://cloud.google.com/iam/docs/access-change-propagation + logger.info( + f"Waiting {self._iam_wait_seconds} seconds for IAM to take effect.." 
+ ) + time.sleep(self._iam_wait_seconds) + def create_bq_connection(self): """Create the BigQuery Connection and returns corresponding service account id.""" client = self._bq_connection_client @@ -253,7 +298,7 @@ def create_bq_connection(self): connection = client.create_connection(request) return connection.name, connection.cloud_resource.service_account_id - def check_bq_connection_exists(self): + def get_service_account_if_connection_exists(self) -> Optional[str]: """Check if the BigQuery Connection exists.""" client = self._bq_connection_client request = bigquery_connection_v1.GetConnectionRequest( @@ -262,12 +307,15 @@ def check_bq_connection_exists(self): ) ) + service_account = None try: - client.get_connection(request=request) - return True + service_account = client.get_connection( + request=request + ).cloud_resource.service_account_id except google.api_core.exceptions.NotFound: pass - return False + + return service_account def generate_udf_code(self, def_, dir): """Generate serialized bytecode using cloudpickle given a udf.""" @@ -624,6 +672,7 @@ def remote_function( bigquery_connection_v1.ConnectionServiceClient ] = None, cloud_functions_client: Optional[functions_v2.FunctionServiceClient] = None, + resource_manager_client: Optional[resourcemanager_v3.ProjectsClient] = None, dataset: Optional[str] = None, bigquery_connection: Optional[str] = None, reuse: bool = True, @@ -688,6 +737,11 @@ def remote_function( Client to use for BigQuery connection operations. If this param is not provided then bigquery connection client from the session would be used. + resource_manager_client (google.cloud.resourcemanager_v3.ProjectsClient, Optional): + Client to use for cloud resource management operations, e.g. for + getting and setting IAM roles on cloud resources. If this param is + not provided then resource manager client from the session would be + used. dataset (str, Optional.): Dataset in which to create a BigQuery remote function. 
It should be in `.` or `` format. If this @@ -734,7 +788,17 @@ def remote_function( cloud_functions_client = session.cloudfunctionsclient if not cloud_functions_client: raise ValueError( - "A functions connection client must be provided, either directly or via session. " + "A cloud functions client must be provided, either directly or via session. " + f"{constants.FEEDBACK_LINK}" + ) + + # A resource manager client is required to get/set IAM operations + if not resource_manager_client: + if session: + resource_manager_client = session.resourcemanagerclient + if not resource_manager_client: + raise ValueError( + "A resource manager client must be provided, either directly or via session. " f"{constants.FEEDBACK_LINK}" ) @@ -819,6 +883,7 @@ def wrapper(f): bigquery_client, bigquery_connection_client, bigquery_connection, + resource_manager_client, ) rf_name, cf_name = remote_function_client.provision_bq_remote_function( f, ibis_signature.input_types, ibis_signature.output_type, uniq_suffix diff --git a/bigframes/series.py b/bigframes/series.py index a1da93dee3..49b0a5b1f0 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -609,6 +609,39 @@ def round_op(x: ibis_types.Value, y: ibis_types.Value): return self._apply_binary_op(decimals, round_op) + def corr(self, other: Series, method="pearson", min_periods=None) -> float: + """ + Compute the correlation with the other Series. Non-number values are ignored in the + computation. + + Uses the "Pearson" method of correlation. Numbers are converted to float before + calculation, so the result may be unstable. + + Args: + other (Series): + The series with which this is to be correlated. + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. 
+ + Returns: + float; Will return NaN if there are fewer than two numeric pairs, either series has a + variance or covariance of zero, or any input value is infinite. + """ + # TODO(kemppeterson): Validate early that both are numeric + # TODO(kemppeterson): Handle partially-numeric columns + if method != "pearson": + raise NotImplementedError( + f"Only Pearson correlation is currently supported. {constants.FEEDBACK_LINK}" + ) + if min_periods: + raise NotImplementedError( + f"min_periods not yet supported. {constants.FEEDBACK_LINK}" + ) + return self._apply_corr_aggregation(other) + def all(self) -> bool: return typing.cast(bool, self._apply_aggregation(agg_ops.all_op)) @@ -851,7 +884,7 @@ def __getitem__(self, indexer): def __getattr__(self, key: str): if hasattr(pandas.Series, key): - raise NotImplementedError( + raise AttributeError( textwrap.dedent( f""" BigQuery DataFrames has not yet implemented an equivalent to @@ -1158,6 +1191,26 @@ def to_string( def to_xarray(self): return self.to_pandas().to_xarray() + def __array_ufunc__( + self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs + ) -> Series: + """Used to support numpy ufuncs. 
+ See: https://numpy.org/doc/stable/reference/ufuncs.html + """ + # Only __call__ supported with zero arguments + if ( + inputs[0] is not self + or method != "__call__" + or len(inputs) > 1 + or len(kwargs) > 0 + ): + return NotImplemented + + if ufunc in ops.NUMPY_TO_OP: + return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + + return NotImplemented + # Keep this at the bottom of the Series class to avoid # confusing type checker by overriding str @property diff --git a/bigframes/session.py b/bigframes/session.py index 3ef5250746..ac2f8fa53a 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -47,6 +47,7 @@ import google.cloud.bigquery_connection_v1 import google.cloud.bigquery_storage_v1 import google.cloud.functions_v2 +import google.cloud.resourcemanager_v3 import google.cloud.storage as storage # type: ignore import ibis import ibis.backends.bigquery as ibis_bigquery @@ -69,6 +70,7 @@ import bigframes.core.guid as guid import bigframes.core.io as bigframes_io from bigframes.core.ordering import IntegerEncoding, OrderingColumnReference +import bigframes.core.utils as utils import bigframes.dataframe as dataframe import bigframes.formatting_helpers as formatting_helpers from bigframes.remote_function import read_gbq_function as bigframes_rgf @@ -99,6 +101,16 @@ # TODO(swast): Need to connect to regional endpoints when performing remote # functions operations (BQ Connection IAM, Cloud Run / Cloud Functions). +# Also see if resource manager client library supports regional endpoints. 
+ +_VALID_ENCODINGS = { + "UTF-8", + "ISO-8859-1", + "UTF-16BE", + "UTF-16LE", + "UTF-32BE", + "UTF-32LE", +} logger = logging.getLogger(__name__) @@ -112,90 +124,143 @@ def _get_default_credentials_with_project(): return pydata_google_auth.default(scopes=_SCOPES, use_local_webserver=False) -def _create_cloud_clients( - project: Optional[str], - location: Optional[str], - use_regional_endpoints: Optional[bool], - credentials: Optional[google.auth.credentials.Credentials], -) -> typing.Tuple[ - bigquery.Client, - google.cloud.bigquery_connection_v1.ConnectionServiceClient, - google.cloud.bigquery_storage_v1.BigQueryReadClient, - google.cloud.functions_v2.FunctionServiceClient, -]: - """Create and initialize BigQuery client objects.""" - - credentials_project = None - if credentials is None: - credentials, credentials_project = _get_default_credentials_with_project() - - # Prefer the project in this order: - # 1. Project explicitly specified by the user - # 2. Project set in the environment - # 3. Project associated with the default credentials - project = ( - project - or os.getenv(_ENV_DEFAULT_PROJECT) - or typing.cast(Optional[str], credentials_project) - ) +class ClientsProvider: + """Provides client instances necessary to perform cloud operations.""" - if not project: - raise ValueError( - "Project must be set to initialize BigQuery client. " - "Try setting `bigframes.options.bigquery.project` first." + def __init__( + self, + project: Optional[str], + location: Optional[str], + use_regional_endpoints: Optional[bool], + credentials: Optional[google.auth.credentials.Credentials], + ): + credentials_project = None + if credentials is None: + credentials, credentials_project = _get_default_credentials_with_project() + + # Prefer the project in this order: + # 1. Project explicitly specified by the user + # 2. Project set in the environment + # 3. 
Project associated with the default credentials + project = ( + project + or os.getenv(_ENV_DEFAULT_PROJECT) + or typing.cast(Optional[str], credentials_project) ) - if use_regional_endpoints: - bq_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format(location=location), - ) - bqstorage_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format(location=location) - ) - bqconnection_options = google.api_core.client_options.ClientOptions( - api_endpoint=_BIGQUERYCONNECTION_REGIONAL_ENDPOINT.format(location=location) - ) - else: - bq_options = None - bqstorage_options = None - bqconnection_options = None - - bq_info = google.api_core.client_info.ClientInfo(user_agent=_APPLICATION_NAME) - bqclient = bigquery.Client( - client_info=bq_info, - client_options=bq_options, - credentials=credentials, - project=project, - location=location, - ) + if not project: + raise ValueError( + "Project must be set to initialize BigQuery client. " + "Try setting `bigframes.options.bigquery.project` first." 
+ ) - bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - bqconnectionclient = google.cloud.bigquery_connection_v1.ConnectionServiceClient( - client_info=bqconnection_info, - client_options=bqconnection_options, - credentials=credentials, - ) + self._project = project + self._location = location + self._use_regional_endpoints = use_regional_endpoints + self._credentials = credentials - bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - bqstorageclient = google.cloud.bigquery_storage_v1.BigQueryReadClient( - client_info=bqstorage_info, - client_options=bqstorage_options, - credentials=credentials, - ) + # cloud clients initialized for lazy load + self._bqclient = None + self._bqconnectionclient = None + self._bqstorageclient = None + self._cloudfunctionsclient = None + self._resourcemanagerclient = None - functions_info = google.api_core.gapic_v1.client_info.ClientInfo( - user_agent=_APPLICATION_NAME - ) - cloudfunctionsclient = google.cloud.functions_v2.FunctionServiceClient( - client_info=functions_info, - credentials=credentials, - ) + @property + def bqclient(self): + if not self._bqclient: + bq_options = None + if self._use_regional_endpoints: + bq_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERY_REGIONAL_ENDPOINT.format( + location=self._location + ), + ) + bq_info = google.api_core.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqclient = bigquery.Client( + client_info=bq_info, + client_options=bq_options, + credentials=self._credentials, + project=self._project, + location=self._location, + ) - return bqclient, bqconnectionclient, bqstorageclient, cloudfunctionsclient + return self._bqclient + + @property + def bqconnectionclient(self): + if not self._bqconnectionclient: + bqconnection_options = None + if self._use_regional_endpoints: + bqconnection_options = 
google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYCONNECTION_REGIONAL_ENDPOINT.format( + location=self._location + ) + ) + bqconnection_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqconnectionclient = ( + google.cloud.bigquery_connection_v1.ConnectionServiceClient( + client_info=bqconnection_info, + client_options=bqconnection_options, + credentials=self._credentials, + ) + ) + + return self._bqconnectionclient + + @property + def bqstorageclient(self): + if not self._bqstorageclient: + bqstorage_options = None + if self._use_regional_endpoints: + bqstorage_options = google.api_core.client_options.ClientOptions( + api_endpoint=_BIGQUERYSTORAGE_REGIONAL_ENDPOINT.format( + location=self._location + ) + ) + bqstorage_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._bqstorageclient = google.cloud.bigquery_storage_v1.BigQueryReadClient( + client_info=bqstorage_info, + client_options=bqstorage_options, + credentials=self._credentials, + ) + + return self._bqstorageclient + + @property + def cloudfunctionsclient(self): + if not self._cloudfunctionsclient: + functions_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._cloudfunctionsclient = ( + google.cloud.functions_v2.FunctionServiceClient( + client_info=functions_info, + credentials=self._credentials, + ) + ) + + return self._cloudfunctionsclient + + @property + def resourcemanagerclient(self): + if not self._resourcemanagerclient: + resourcemanager_info = google.api_core.gapic_v1.client_info.ClientInfo( + user_agent=_APPLICATION_NAME + ) + self._resourcemanagerclient = ( + google.cloud.resourcemanager_v3.ProjectsClient( + credentials=self._credentials, client_info=resourcemanager_info + ) + ) + + return self._resourcemanagerclient class Session( @@ -221,12 +286,9 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): 
else: self._location = context.location - ( - self.bqclient, - self.bqconnectionclient, - self.bqstorageclient, - self.cloudfunctionsclient, - ) = _create_cloud_clients( + # Instantiate a clients provider to help with cloud clients that will be + # used in the future operations in the session + self._clients_provider = ClientsProvider( project=context.project, location=self._location, use_regional_endpoints=context.use_regional_endpoints, @@ -249,6 +311,26 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): # changed. context._session_started = True + @property + def bqclient(self): + return self._clients_provider.bqclient + + @property + def bqconnectionclient(self): + return self._clients_provider.bqconnectionclient + + @property + def bqstorageclient(self): + return self._clients_provider.bqstorageclient + + @property + def cloudfunctionsclient(self): + return self._clients_provider.cloudfunctionsclient + + @property + def resourcemanagerclient(self): + return self._clients_provider.resourcemanagerclient + @property def _session_dataset_id(self): """A dataset for storing temporary objects local to the session @@ -343,6 +425,38 @@ def read_gbq( max_results=max_results, ) + def _query_to_destination( + self, query: str, index_cols: List[str] + ) -> Tuple[Optional[bigquery.TableReference], Optional[bigquery.QueryJob]]: + # If there are no index columns, then there's no reason to cache to a + # (clustered) session table, as we'll just have to query it again to + # create a default index & ordering. + if not index_cols: + _, query_job = self._start_query(query) + return query_job.destination, query_job + + # If a dry_run indicates this is not a query type job, then don't + # bother trying to do a CREATE TEMP TABLE ... AS SELECT ... statement. 
+ dry_run_config = bigquery.QueryJobConfig() + dry_run_config.dry_run = True + _, dry_run_job = self._start_query(query, job_config=dry_run_config) + if dry_run_job.statement_type != "SELECT": + _, query_job = self._start_query(query) + return query_job.destination, query_job + + # Make sure we cluster by the index column(s) so that subsequent + # operations are as speedy as they can be. + try: + ibis_expr = self.ibis_client.sql(query) + return self._ibis_to_session_table(ibis_expr, index_cols), None + except google.api_core.exceptions.BadRequest: + # Some SELECT statements still aren't compatible with CREATE TEMP + # TABLE ... AS SELECT ... statements. For example, if the query has + # a top-level ORDER BY, this conflicts with our ability to cluster + # the table by the index column(s). + _, query_job = self._start_query(query) + return query_job.destination, query_job + def read_gbq_query( self, query: str, @@ -368,16 +482,7 @@ def read_gbq_query( else: index_cols = list(index_col) - # Make sure we cluster by the index column so that subsequent - # operations are as speedy as they can be. - if index_cols: - # Since index_cols are specified, assume that we have a normal SQL - # query. DDL or DML not supported. - ibis_expr = self.ibis_client.sql(query) - destination = self._ibis_to_session_table(ibis_expr, index_cols) - else: - _, query_job = self._start_query(query) - destination = query_job.destination + destination, query_job = self._query_to_destination(query, index_cols) # If there was no destination table, that means the query must have # been DDL or DML. Return some job metadata, instead. 
@@ -385,9 +490,11 @@ def read_gbq_query( return dataframe.DataFrame( data=pandas.DataFrame( { - "statement_type": [query_job.statement_type], - "job_id": [query_job.job_id], - "location": [query_job.location], + "statement_type": [ + query_job.statement_type if query_job else "unknown" + ], + "job_id": [query_job.job_id if query_job else "unknown"], + "location": [query_job.location if query_job else "unknown"], } ), session=self, @@ -551,9 +658,10 @@ def _read_gbq_with_ordering( table_expression: ibis_types.Table, *, col_order: Iterable[str] = (), - index_cols: Sequence[str] = (), - index_labels: Sequence[Optional[str]] = (), - hidden_cols: Sequence[str] = (), + col_labels: Iterable[Optional[str]] = (), + index_cols: Iterable[str] = (), + index_labels: Iterable[Optional[str]] = (), + hidden_cols: Iterable[str] = (), ordering: core.ExpressionOrdering, is_total_ordering: bool = False, ) -> dataframe.DataFrame: @@ -563,9 +671,13 @@ def _read_gbq_with_ordering( table_expression: an ibis table expression to be executed in BigQuery. col_order: - List of BigQuery column names in the desired order for results DataFrame. + List of BigQuery column ids in the desired order for results DataFrame. + col_labels: + List of column labels as the column names. index_cols: - List of column names to use as the index or multi-index. + List of index ids to use as the index or multi-index. + index_labels: + List of index labels as names of index. hidden_cols: Columns that should be hidden. Ordering columns may (not always) be hidden ordering: @@ -574,6 +686,7 @@ def _read_gbq_with_ordering( Returns: A DataFrame representing results of the query or table. """ + index_cols, index_labels = list(index_cols), list(index_labels) if len(index_cols) != len(index_labels): raise ValueError( "Needs same number of index labels are there are index columns. 
" @@ -597,11 +710,14 @@ def _read_gbq_with_ordering( table_expression, index_cols ) index_col_values = [table_expression[index_id] for index_id in index_cols] + if not col_labels: + col_labels = column_keys return self._read_ibis( table_expression, index_col_values, index_labels, column_keys, + col_labels, ordering=ordering, ) @@ -650,9 +766,10 @@ def _read_bigquery_load_job( def _read_ibis( self, table_expression: ibis_types.Table, - index_cols: Sequence[ibis_types.Value], - index_labels: Sequence[Optional[str]], - column_keys: Sequence[str], + index_cols: Iterable[ibis_types.Value], + index_labels: Iterable[blocks.Label], + column_keys: Iterable[str], + column_labels: Iterable[blocks.Label], ordering: core.ExpressionOrdering, ) -> dataframe.DataFrame: """Turns a table expression (plus index column) into a DataFrame.""" @@ -674,7 +791,7 @@ def _read_ibis( self, table_expression, columns, hidden_ordering_columns, ordering ), index_columns=[index_col.get_name() for index_col in index_cols], - column_labels=column_keys, + column_labels=column_labels, index_labels=index_labels, ) @@ -713,15 +830,23 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame Returns: bigframes.dataframe.DataFrame: The BigQuery DataFrame. 
""" + col_labels, idx_labels = ( + pandas_dataframe.columns.to_list(), + pandas_dataframe.index.names, + ) + new_col_ids, new_idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + # Add order column to pandas DataFrame to preserve order in BigQuery ordering_col = "rowid" - columns = frozenset(pandas_dataframe.columns) + columns = frozenset(col_labels + idx_labels) suffix = 2 while ordering_col in columns: ordering_col = f"rowid_{suffix}" suffix += 1 pandas_dataframe_copy = pandas_dataframe.copy() + pandas_dataframe_copy.index.names = new_idx_ids + pandas_dataframe_copy.columns = pandas.Index(new_col_ids) pandas_dataframe_copy[ordering_col] = np.arange(pandas_dataframe_copy.shape[0]) # Specify the datetime dtypes, which is auto-detected as timestamp types. @@ -732,27 +857,12 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame bigquery.SchemaField(column, bigquery.enums.SqlTypeNames.DATETIME) ) - # Unnamed are not copied to BigQuery when load_table_from_dataframe - # executes. - index_cols = list( - filter(lambda name: name is not None, pandas_dataframe_copy.index.names) - ) - index_labels = typing.cast(List[Optional[str]], index_cols) - # Clustering probably not needed anyways as pandas tables are small cluster_cols = [ordering_col] - if len(index_cols) == 0: - # Block constructor will implicitly build default index - pass - job_config = bigquery.LoadJobConfig(schema=schema) job_config.clustering_fields = cluster_cols - # TODO(swast): Rename the unnamed index columns and restore them after - # the load job completes. - # Column values will be loaded as null if the column name has spaces. 
- # https://github.com/googleapis/python-bigquery/issues/1566 load_table_destination = self._create_session_table() load_job = self.bqclient.load_table_from_dataframe( pandas_dataframe_copy, @@ -770,14 +880,22 @@ def read_pandas(self, pandas_dataframe: pandas.DataFrame) -> dataframe.DataFrame f"SELECT * FROM `{load_table_destination.table_id}`" ) - return self._read_gbq_with_ordering( + # b/297590178 Potentially a bug in bqclient.load_table_from_dataframe(), that only when the DF is empty, the index columns disappear in table_expression. + if any( + [new_idx_id not in table_expression.columns for new_idx_id in new_idx_ids] + ): + new_idx_ids, idx_labels = [], [] + + df = self._read_gbq_with_ordering( table_expression=table_expression, - index_cols=index_cols, - index_labels=index_labels, + col_labels=col_labels, + index_cols=new_idx_ids, + index_labels=idx_labels, hidden_cols=(ordering_col,), ordering=ordering, is_total_ordering=True, ) + return df def read_csv( self, @@ -844,10 +962,9 @@ def read_csv( f"{constants.FEEDBACK_LINK}" ) - valid_encodings = {"UTF-8", "ISO-8859-1"} - if encoding is not None and encoding not in valid_encodings: + if encoding is not None and encoding not in _VALID_ENCODINGS: raise NotImplementedError( - f"BigQuery engine only supports the following encodings: {valid_encodings}. " + f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}. 
" f"{constants.FEEDBACK_LINK}" ) @@ -933,6 +1050,86 @@ def read_parquet( return self._read_bigquery_load_job(path, table, job_config=job_config) + def read_json( + self, + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, + ) -> dataframe.DataFrame: + table = bigquery.Table(self._create_session_table()) + + if engine == "bigquery": + + if dtype is not None: + raise NotImplementedError( + "BigQuery engine does not support the dtype arguments." + ) + + if not lines: + raise NotImplementedError( + "Only newline delimited JSON format is supported." + ) + + if encoding is not None and encoding not in _VALID_ENCODINGS: + raise NotImplementedError( + f"BigQuery engine only supports the following encodings: {_VALID_ENCODINGS}" + ) + + if lines and orient != "records": + raise ValueError( + "'lines' keyword is only valid when 'orient' is 'records'." + ) + + job_config = bigquery.LoadJobConfig() + job_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED + job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON + job_config.write_disposition = bigquery.WriteDisposition.WRITE_EMPTY + job_config.autodetect = True + job_config.encoding = encoding + + return self._read_bigquery_load_job( + path_or_buf, + table, + job_config=job_config, + ) + else: + if any(arg in kwargs for arg in ("chunksize", "iterator")): + raise NotImplementedError( + "'chunksize' and 'iterator' arguments are not supported." 
+ ) + + if isinstance(path_or_buf, str): + self._check_file_size(path_or_buf) + + if engine == "ujson": + pandas_df = pandas.read_json( # type: ignore + path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + **kwargs, + ) + + else: + pandas_df = pandas.read_json( # type: ignore + path_or_buf, + orient=orient, + dtype=dtype, + encoding=encoding, + lines=lines, + engine=engine, + **kwargs, + ) + return self.read_pandas(pandas_df) + def _check_file_size(self, filepath: str): max_size = 1024 * 1024 * 1024 # 1 GB in bytes if filepath.startswith("gs://"): # GCS file path @@ -1008,14 +1205,26 @@ def _query_to_session_table( table = self._create_session_table() cluster_cols_sql = ", ".join(f"`{cluster_col}`" for cluster_col in cluster_cols) - # TODO(swast): This might not support multi-statement SQL queries. + # TODO(swast): This might not support multi-statement SQL queries (scripts). ddl_text = f""" CREATE TEMP TABLE `_SESSION`.`{table.table_id}` CLUSTER BY {cluster_cols_sql} AS {query_text} """ + + job_config = bigquery.QueryJobConfig() + + # Include a label so that Dataplex Lineage can identify temporary + # tables that BigQuery DataFrames creates. Googlers: See internal issue + # 296779699. We're labeling the job instead of the table because + # otherwise we get `BadRequest: 400 OPTIONS on temporary tables are not + # supported`. + job_config.labels = {"source": "bigquery-dataframes-temp"} + try: - self._start_query(ddl_text) # Wait for the job to complete + self._start_query( + ddl_text, job_config=job_config + ) # Wait for the job to complete except google.api_core.exceptions.Conflict: # Allow query retry to succeed. 
pass diff --git a/mypy.ini b/mypy.ini index ce78c4686e..901394813a 100644 --- a/mypy.ini +++ b/mypy.ini @@ -15,6 +15,9 @@ ignore_missing_imports = True [mypy-google.colab] ignore_missing_imports = True +[mypy-google.iam.*] +ignore_missing_imports = True + [mypy-pytz] ignore_missing_imports = True diff --git a/notebooks/dataframes/dataframe.ipynb b/notebooks/dataframes/dataframe.ipynb index 241c767f57..85ea61d281 100644 --- a/notebooks/dataframes/dataframe.ipynb +++ b/notebooks/dataframes/dataframe.ipynb @@ -31,7 +31,22 @@ "execution_count": 2, "id": "96757c59-fc22-420e-a42f-c6cb956110ec", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "944f0e4417154e81b6496302fe756465", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job ac4d1f2b-e9f3-4d95-b78d-57e40eee93fa is RUNNING. Cubs\n", " 175\n", " \n", + " \n", + " 20\n", + " 71ab82a4-6e07-430a-b695-1af3bc42ea61\n", + " 2016\n", + " Nationals\n", + " Cubs\n", + " 257\n", + " \n", + " \n", + " 21\n", + " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 178\n", + " \n", + " \n", + " 22\n", + " 6d111b57-fa0b-4f24-82df-ff33a26f0252\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 171\n", + " \n", + " \n", + " 23\n", + " a97e9539-bbbd-4e03-bf15-f25ea2c1d923\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 248\n", + " \n", + " \n", + " 24\n", + " dc0c9218-505c-4725-8c0c-40b72cca0956\n", + " 2016\n", + " Astros\n", + " Cubs\n", + " 174\n", + " \n", " \n", "\n", + "

25 rows × 5 columns

\n", "[2431 rows x 5 columns in total]" ], "text/plain": [ @@ -419,6 +503,34 @@ "tags": [] }, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "276760df4c904ced81cbaff3a65d026e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 1943ae42-bcbd-4c2f-914f-209377b5c4d9 is DONE. 0 Bytes processed.
Cubs\n", " 175\n", " \n", + " \n", + " 20\n", + " 71ab82a4-6e07-430a-b695-1af3bc42ea61\n", + " 2016\n", + " Nationals\n", + " Cubs\n", + " 257\n", + " \n", + " \n", + " 21\n", + " d1a110c2-f6c8-4029-bcd8-2f8a01e1561c\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 178\n", + " \n", + " \n", + " 22\n", + " 6d111b57-fa0b-4f24-82df-ff33a26f0252\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 171\n", + " \n", + " \n", + " 23\n", + " a97e9539-bbbd-4e03-bf15-f25ea2c1d923\n", + " 2016\n", + " Brewers\n", + " Cubs\n", + " 248\n", + " \n", + " \n", + " 24\n", + " dc0c9218-505c-4725-8c0c-40b72cca0956\n", + " 2016\n", + " Astros\n", + " Cubs\n", + " 174\n", + " \n", " \n", "\n", + "

25 rows × 5 columns

\n", "[2431 rows x 5 columns in total]" ], "text/plain": [ @@ -3063,6 +3768,34 @@ "id": "ac3ceabe-4317-453c-9418-826de5094454", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c73064d64afe41cea6738085b273e29d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 08477df8-3e86-4f94-a905-8ac2f36e2b69 is DONE. 0 Bytes processed.
core.BqmlModel: +def penguins_bqml_kmeans_model( + session: bigframes.Session, penguins_kmeans_model_name: str +) -> core.BqmlModel: model = session.bqclient.get_model(penguins_kmeans_model_name) return core.BqmlModel(session, model) +@pytest.fixture(scope="session") +def penguins_bqml_pca_model( + session: bigframes.Session, penguins_pca_model_name: str +) -> core.BqmlModel: + model = session.bqclient.get_model(penguins_pca_model_name) + return core.BqmlModel(session, model) + + @pytest.fixture(scope="session") def penguins_linear_model( session, penguins_linear_model_name: str @@ -140,32 +147,12 @@ def penguins_kmeans_model(session, penguins_kmeans_model_name: str) -> cluster.K @pytest.fixture(scope="session") def penguins_pca_model( - session: bigframes.Session, dataset_id_permanent, penguins_table_id + session: bigframes.Session, penguins_pca_model_name: str ) -> decomposition.PCA: - - # TODO(yunmengxie): Create a shared method to get different types of pretrained models. - sql = f""" -CREATE OR REPLACE MODEL `$model_name` -OPTIONS ( - model_type='pca', - num_principal_components=3 -) AS SELECT - * -FROM `{penguins_table_id}`""" - # We use the SQL hash as the name to ensure the model is regenerated if this fixture is edited - model_name = ( - f"{dataset_id_permanent}.penguins_pca_{hashlib.md5(sql.encode()).hexdigest()}" + return cast( + decomposition.PCA, + session.read_gbq_model(penguins_pca_model_name), ) - sql = sql.replace("$model_name", model_name) - - try: - return session.read_gbq_model(model_name) - except google.cloud.exceptions.NotFound: - logging.info( - "penguins_pca_model fixture was not found in the permanent dataset, regenerating it..." 
- ) - session.bqclient.query(sql).result() - return session.read_gbq_model(model_name) @pytest.fixture(scope="session") diff --git a/tests/system/small/ml/test_core.py b/tests/system/small/ml/test_core.py index 4b184b0d4c..6c3e8e06f5 100644 --- a/tests/system/small/ml/test_core.py +++ b/tests/system/small/ml/test_core.py @@ -140,6 +140,100 @@ def test_model_centroids(penguins_bqml_kmeans_model: core.BqmlModel): ) +def test_pca_model_principal_components(penguins_bqml_pca_model: core.BqmlModel): + result = penguins_bqml_pca_model.principal_components().to_pandas() + assert result.shape == (21, 4) + + # result is too long, only check the first principal component here. + result = result.head(7) + expected = pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + ], + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, + ], + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], + ], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + 
check_dtype=False, + ) + + +def test_pca_model_principal_component_info(penguins_bqml_pca_model: core.BqmlModel): + result = penguins_bqml_pca_model.principal_component_info().to_pandas() + assert result.shape == (3, 4) + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "eigenvalue": [3.278657, 1.270829, 1.125354], + "explained_variance_ratio": [0.469357, 0.181926, 0.1611], + "cumulative_explained_variance_ratio": [0.469357, 0.651283, 0.812383], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + # int64 Index by default in pandas versus Int64 (nullable) Index in BigQuery DataFrame + check_index_type=False, + check_dtype=False, + ) + + def test_model_predict(penguins_bqml_linear_model: core.BqmlModel, new_penguins_df): predictions = penguins_bqml_linear_model.predict(new_penguins_df).to_pandas() expected = pd.DataFrame( diff --git a/tests/system/small/ml/test_decomposition.py b/tests/system/small/ml/test_decomposition.py index 01d5207750..8df4145fcf 100644 --- a/tests/system/small/ml/test_decomposition.py +++ b/tests/system/small/ml/test_decomposition.py @@ -55,7 +55,7 @@ def test_pca_predict(session, penguins_pca_model: decomposition.PCA): ) -def test_pca_score(session, penguins_pca_model: decomposition.PCA): +def test_pca_score(penguins_pca_model: decomposition.PCA): result = penguins_pca_model.score().to_pandas() expected = pd.DataFrame( {"total_explained_variance_ratio": [0.812383]}, @@ -68,3 +68,110 @@ def test_pca_score(session, penguins_pca_model: decomposition.PCA): rtol=0.1, check_index_type=False, ) + + +def test_pca_components_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.components_.to_pandas() + + # result is too long, only check the first principal component here. 
+ result = result.head(7) + expected = pd.DataFrame( + { + "principal_component_id": [0] * 7, + "feature": [ + "species", + "island", + "culmen_length_mm", + "culmen_depth_mm", + "flipper_length_mm", + "body_mass_g", + "sex", + ], + "numerical_value": [ + pd.NA, + pd.NA, + 0.401489, + -0.377482, + 0.524052, + 0.501174, + pd.NA, + ], + "categorical_value": [ + [ + { + "category": "Gentoo penguin (Pygoscelis papua)", + "value": 0.25068877125667804, + }, + { + "category": "Adelie Penguin (Pygoscelis adeliae)", + "value": -0.20622291900416198, + }, + { + "category": "Chinstrap penguin (Pygoscelis antarctica)", + "value": -0.030161149275185855, + }, + ], + [ + {"category": "Biscoe", "value": 0.19761120114410635}, + {"category": "Dream", "value": -0.11264736305259061}, + {"category": "Torgersen", "value": -0.07065913511418596}, + ], + [], + [], + [], + [], + [ + {"category": ".", "value": 0.0015916894448071784}, + {"category": "MALE", "value": 0.06869704739750442}, + {"category": "FEMALE", "value": -0.052521171596813174}, + {"category": "_null_filler", "value": -0.0034628622681684906}, + ], + ], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_pca_explained_variance_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.explained_variance_.to_pandas() + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "explained_variance": [3.278657, 1.270829, 1.125354], + }, + ) + pd.testing.assert_frame_equal( + result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) + + +def test_pca_explained_variance_ratio_(penguins_pca_model: decomposition.PCA): + result = penguins_pca_model.explained_variance_ratio_.to_pandas() + + expected = pd.DataFrame( + { + "principal_component_id": [0, 1, 2], + "explained_variance_ratio": [0.469357, 0.181926, 0.1611], + }, + ) + pd.testing.assert_frame_equal( + 
result, + expected, + check_exact=False, + rtol=0.1, + check_index_type=False, + check_dtype=False, + ) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 5b4f9ebccc..01305adb20 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -77,6 +77,20 @@ def test_df_construct_from_series(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_df_construct_from_dict(): + input_dict = { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + # With a space in column name. We use standardized SQL schema ids to solve the problem that BQ schema doesn't support column names with spaces. b/296751058 + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + bf_result = dataframe.DataFrame(input_dict).to_pandas() + pd_result = pd.DataFrame(input_dict) + + pandas.testing.assert_frame_equal( + bf_result, pd_result, check_dtype=False, check_index_type=False + ) + + def test_get_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs col_name = "int64_col" @@ -356,6 +370,52 @@ def test_assign_new_column_w_setitem(scalars_dfs): pd.testing.assert_frame_equal(bf_result, pd_result) +def test_assign_new_column_w_setitem_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. 
+ pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_custom_index(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + # set the custom index + pd_df = pd_df.set_index("string_col") + bf_df = bf_df.set_index("string_col") + + bf_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + pd_df["new_col"] = [9, 8, 7, 6, 5, 4, 3, 2, 1] + bf_result = bf_df.to_pandas() + pd_result = pd_df + + # Convert default pandas dtypes `int64` to match BigQuery DataFrames dtypes. + pd_result["new_col"] = pd_result["new_col"].astype("Int64") + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_assign_new_column_w_setitem_list_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_df = scalars_df.copy() + pd_df = scalars_pandas_df.copy() + + with pytest.raises(ValueError): + pd_df["new_col"] = [1, 2, 3] # should be len 9, is 3 + with pytest.raises(ValueError): + bf_df["new_col"] = [1, 2, 3] + + def test_assign_existing_column(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs kwargs = {"int64_col": 2} @@ -1329,6 +1389,21 @@ def test_df_describe(scalars_dfs): ).all() +def test_df_stack(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + # To match bigquery dataframes + scalars_pandas_df = scalars_pandas_df.copy() + scalars_pandas_df.columns = scalars_pandas_df.columns.astype("string[pyarrow]") + # Can only stack identically-typed columns + columns = ["int64_col", "int64_too", "rowindex_2"] + + bf_result = scalars_df[columns].stack().to_pandas() + pd_result = scalars_pandas_df[columns].stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pd.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + + @pytest.mark.parametrize( ("values", "index", "columns"), [ @@ -1734,8 +1809,9 @@ def test_df___array__(scalars_df_index, 
scalars_pandas_df_index): ) -def test_getattr_not_implemented(scalars_df_index): - with pytest.raises(NotImplementedError): +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): scalars_df_index.asof() diff --git a/tests/system/small/test_dataframe_io.py b/tests/system/small/test_dataframe_io.py index 1f5aa906c8..3886b85f40 100644 --- a/tests/system/small/test_dataframe_io.py +++ b/tests/system/small/test_dataframe_io.py @@ -32,6 +32,7 @@ import bigframes import bigframes.dataframe +import bigframes.pandas as bpd def test_to_pandas_w_correct_dtypes(scalars_df_default_index): @@ -339,51 +340,68 @@ def test_to_parquet_index(scalars_dfs, gcs_folder, index): pd.testing.assert_frame_equal(gcs_df, scalars_pandas_df) +def test_to_sql_query_unnamed_index_included( + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, +): + bf_df = scalars_df_default_index.reset_index(drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) + assert len(idx_labels) == 1 + assert len(idx_ids) == 1 + assert idx_labels[0] is None + assert idx_ids[0].startswith("bigframes") + + pd_df = scalars_pandas_df_default_index.reset_index(drop=True) + roundtrip = session.read_gbq(sql, index_col=idx_ids) + roundtrip.index.names = [None] + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) + + def test_to_sql_query_named_index_included( - session, scalars_df_index, scalars_pandas_df_index + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - sql, index_columns = scalars_df_index._to_sql_query(always_include_index=True) - assert len(index_columns) == 1 - index_column, is_named = index_columns[0] - assert index_column == "rowindex" - assert is_named - - roundtrip = session.read_gbq(sql, index_col=[index_column]) - 
assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), scalars_pandas_df_index - ) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=True) + assert len(idx_labels) == 1 + assert len(idx_ids) == 1 + assert idx_labels[0] == "rowindex_2" + assert idx_ids[0] == "rowindex_2" + + pd_df = scalars_pandas_df_default_index.set_index("rowindex_2", drop=True) + roundtrip = session.read_gbq(sql, index_col=idx_ids) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) def test_to_sql_query_unnamed_index_excluded( - session, scalars_df_default_index, scalars_pandas_df_default_index + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - # The .sql property should return SQL without the unnamed indexes - sql, index_columns = scalars_df_default_index._to_sql_query( - always_include_index=False - ) - assert len(index_columns) == 0 + bf_df = scalars_df_default_index.reset_index(drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) + assert len(idx_labels) == 0 + assert len(idx_ids) == 0 + pd_df = scalars_pandas_df_default_index.reset_index(drop=True) roundtrip = session.read_gbq(sql) - assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), scalars_pandas_df_default_index - ) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) -def test_to_sql_query_unnamed_index_always_include( - session, - scalars_df_default_index: bigframes.dataframe.DataFrame, - scalars_pandas_df_default_index, +def test_to_sql_query_named_index_excluded( + session: bigframes.Session, + scalars_df_default_index: bpd.DataFrame, + scalars_pandas_df_default_index: pd.DataFrame, ): - sql, index_columns = scalars_df_default_index._to_sql_query( - always_include_index=True - ) - assert len(index_columns) == 1 - index_column, is_named = index_columns[0] - assert index_column == 
"bigframes_index_0" - assert not is_named - - roundtrip = session.read_gbq(sql, index_col=[index_column]) - roundtrip.index.name = None - assert_pandas_df_equal_ignore_ordering( - roundtrip.to_pandas(), scalars_pandas_df_default_index - ) + bf_df = scalars_df_default_index.set_index("rowindex_2", drop=True) + sql, idx_ids, idx_labels = bf_df._to_sql_query(include_index=False) + assert len(idx_labels) == 0 + assert len(idx_ids) == 0 + + pd_df = scalars_pandas_df_default_index.set_index( + "rowindex_2", drop=True + ).reset_index(drop=True) + roundtrip = session.read_gbq(sql) + assert_pandas_df_equal_ignore_ordering(roundtrip.to_pandas(), pd_df) diff --git a/tests/system/small/test_groupby.py b/tests/system/small/test_groupby.py index 5a2562bfb2..987368ce77 100644 --- a/tests/system/small/test_groupby.py +++ b/tests/system/small/test_groupby.py @@ -112,7 +112,9 @@ def test_dataframe_groupby_agg_list(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) -def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index): +def test_dataframe_groupby_agg_dict_with_list( + scalars_df_index, scalars_pandas_df_index +): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( scalars_df_index[col_names] @@ -129,6 +131,23 @@ def test_dataframe_groupby_agg_dict(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) +def test_dataframe_groupby_agg_dict_no_lists(scalars_df_index, scalars_pandas_df_index): + col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] + bf_result = ( + scalars_df_index[col_names] + .groupby("string_col") + .agg({"int64_too": "mean", "string_col": "count"}) + ) + pd_result = ( + scalars_pandas_df_index[col_names] + .groupby("string_col") + .agg({"int64_too": "mean", "string_col": "count"}) + ) + bf_result_computed = bf_result.to_pandas() + + 
pd.testing.assert_frame_equal(pd_result, bf_result_computed, check_dtype=False) + + def test_dataframe_groupby_agg_named(scalars_df_index, scalars_pandas_df_index): col_names = ["int64_too", "float64_col", "int64_col", "bool_col", "string_col"] bf_result = ( diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index b2937d7da9..1baf3e6650 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -620,3 +620,39 @@ def test_column_multi_index_cumsum(scalars_df_index, scalars_pandas_df_index): pd_result = pd_df.cumsum() pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", "a", "b"]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index(["a", "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = bf_df.stack().to_pandas() + pd_result = pd_df.stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): + columns = ["int64_too", "int64_col", "rowindex_2"] + level1 = pandas.Index(["b", pandas.NA, pandas.NA]) + # Need resulting column to be pyarrow string rather than object dtype + level2 = pandas.Index([pandas.NA, "b", "b"], dtype="string[pyarrow]") + multi_columns = pandas.MultiIndex.from_arrays([level1, level2]) + bf_df = scalars_df_index[columns].copy() + bf_df.columns = multi_columns + pd_df = scalars_pandas_df_index[columns].copy() + pd_df.columns = multi_columns + + bf_result = 
bf_df.stack().to_pandas() + pd_result = pd_df.stack() + + # Pandas produces NaN, where bq dataframes produces pd.NA + pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py new file mode 100644 index 0000000000..fff689caba --- /dev/null +++ b/tests/system/small/test_numpy.py @@ -0,0 +1,69 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pandas as pd +import pytest + + +@pytest.mark.parametrize( + ("opname",), + [ + ("sin",), + ("cos",), + ("tan",), + ("arcsin",), + ("arccos",), + ("arctan",), + ("sinh",), + ("cosh",), + ("tanh",), + ("arcsinh",), + ("arccosh",), + ("arctanh",), + ("exp",), + ("log",), + ("log10",), + ("sqrt",), + ("abs",), + ], +) +def test_series_ufuncs(floats_pd, floats_bf, opname): + bf_result = getattr(np, opname)(floats_bf).to_pandas() + pd_result = getattr(np, opname)(floats_pd) + + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("opname",), + [ + ("sin",), + ("cos",), + ("tan",), + ("log",), + ("log10",), + ("sqrt",), + ("abs",), + ], +) +def test_df_ufuncs(scalars_dfs, opname): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = getattr(np, opname)( + scalars_df[["float64_col", "int64_col"]] + ).to_pandas() + pd_result = getattr(np, opname)(scalars_pandas_df[["float64_col", "int64_col"]]) + + pd.testing.assert_frame_equal(bf_result, 
pd_result) diff --git a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index e40addc4eb..c60d270fca 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -116,6 +116,7 @@ def test_remote_function_direct_no_session_param( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection, @@ -126,6 +127,7 @@ def test_remote_function_direct_no_session_param( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection, # See e2e tests for tests that actually deploy the Cloud Function. @@ -166,6 +168,7 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection_location, @@ -176,6 +179,7 @@ def test_remote_function_direct_no_session_param_location_specified( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location, # See e2e tests for tests that actually deploy the Cloud Function. 
@@ -213,6 +217,7 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, dataset_id_permanent, bq_cf_connection_location_mismatched, ): @@ -224,6 +229,7 @@ def test_remote_function_direct_no_session_param_location_mismatched( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. @@ -238,6 +244,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_dfs, dataset_id_permanent, bq_cf_connection_location_project, @@ -248,6 +255,7 @@ def test_remote_function_direct_no_session_param_location_project_specified( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project, # See e2e tests for tests that actually deploy the Cloud Function. 
@@ -285,6 +293,7 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, dataset_id_permanent, bq_cf_connection_location_project_mismatched, ): @@ -296,6 +305,7 @@ def test_remote_function_direct_no_session_param_project_mismatched( bigquery_client=bigquery_client, bigquery_connection_client=bigqueryconnection_client, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, dataset=dataset_id_permanent, bigquery_connection=bq_cf_connection_location_project_mismatched, # See e2e tests for tests that actually deploy the Cloud Function. @@ -530,6 +540,7 @@ def test_read_gbq_function_like_original( bigquery_client, bigqueryconnection_client, cloudfunctions_client, + resourcemanager_client, scalars_df_index, dataset_id_permanent, bq_cf_connection, @@ -541,6 +552,7 @@ def test_read_gbq_function_like_original( bigquery_connection_client=bigqueryconnection_client, dataset=dataset_id_permanent, cloud_functions_client=cloudfunctions_client, + resource_manager_client=resourcemanager_client, bigquery_connection=bq_cf_connection, reuse=True, ) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 70c56e5e13..88ad2245c9 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -459,6 +459,19 @@ def test_mods(scalars_dfs, col_x, col_y, method): pd.testing.assert_series_equal(pd_result, bf_result) +# We work around a pandas bug that doesn't handle correlating nullable dtypes by doing this +# manually with dumb self-correlation instead of parameterized as test_mods is above. 
+def test_corr(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + bf_result = scalars_df["int64_too"].corr(scalars_df["int64_too"]) + pd_result = ( + scalars_pandas_df["int64_too"] + .astype("int64") + .corr(scalars_pandas_df["int64_too"].astype("int64")) + ) + assert math.isclose(pd_result, bf_result) + + @pytest.mark.parametrize( ("col_x",), [ @@ -900,7 +913,7 @@ def test_binop_repeated_application_does_row_identity_joins(scalars_dfs): pd_result, ) - bf_sql, _ = bf_series.to_frame()._to_sql_query(always_include_index=True) + bf_sql, _, _ = bf_series.to_frame()._to_sql_query(include_index=True) selects = re.findall("SELECT", bf_sql.upper()) assert 0 < len(selects) < (num_joins // 2) @@ -2222,8 +2235,9 @@ def test_argmax(scalars_df_index, scalars_pandas_df_index): assert bf_result == pd_result -def test_getattr_not_implemented(scalars_df_index): - with pytest.raises(NotImplementedError): +def test_getattr_attribute_error_when_pandas_has(scalars_df_index): + # asof is implemented in pandas but not in bigframes + with pytest.raises(AttributeError): scalars_df_index.string_col.asof() diff --git a/tests/system/small/test_session.py b/tests/system/small/test_session.py index d825c62561..b7bee16ffd 100644 --- a/tests/system/small/test_session.py +++ b/tests/system/small/test_session.py @@ -111,6 +111,54 @@ def test_read_gbq_w_col_order( ["uuid"], id="unique_uuid_index_query", ), + pytest.param( + """ + SELECT my_index, my_value + FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ) + -- Can't normally cluster tables with ORDER BY clause. 
+ ORDER BY my_index DESC + """, + ["my_index"], + id="unique_index_query_has_order_by", + ), + pytest.param( + """ + WITH my_table AS ( + SELECT * + FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ) + ) + SELECT my_index, my_value FROM my_table + """, + ["my_index"], + id="unique_index_query_with_named_table_expression", + ), + pytest.param( + """ + CREATE TEMP TABLE test_read_gbq_w_index_col_unique_index_query_with_script + AS SELECT * FROM UNNEST( + [ + STRUCT(0, 12), + STRUCT(1, 12), + STRUCT(2, 24) + ] + ); + SELECT my_index, my_value FROM test_read_gbq_w_index_col_unique_index_query_with_script + """, + ["my_index"], + id="unique_index_query_with_script", + ), pytest.param( "{scalars_table_id}", ["bool_col"], @@ -221,7 +269,7 @@ def test_read_gbq_w_max_results( assert bf_result.shape[0] == max_results -def test_read_gbq_w_script(session, dataset_id: str): +def test_read_gbq_w_script_no_select(session, dataset_id: str): ddl = f""" CREATE TABLE `{dataset_id}.test_read_gbq_w_ddl` ( `col_a` INT64, @@ -252,6 +300,20 @@ def test_read_pandas(session, scalars_dfs): pd.testing.assert_frame_equal(result, expected) +def test_read_pandas_col_label_w_space(session: bigframes.Session): + expected = pd.DataFrame( + { + "Animal": ["Falcon", "Falcon", "Parrot", "Parrot"], + "Max Speed": [380.0, 370.0, 24.0, 26.0], + } + ) + result = session.read_pandas(expected).to_pandas() + + pd.testing.assert_frame_equal( + result, expected, check_index_type=False, check_dtype=False + ) + + def test_read_pandas_multi_index(session, scalars_pandas_df_multi_index): df = session.read_pandas(scalars_pandas_df_multi_index) result = df.to_pandas() @@ -755,6 +817,67 @@ def test_read_parquet_gcs(session: bigframes.Session, scalars_dfs, gcs_folder): pd.testing.assert_frame_equal(pd_df_in, pd_df_out) +def test_read_json_gcs_bq_engine(session, scalars_dfs, gcs_folder): + scalars_df, _ = scalars_dfs + path = gcs_folder + "test_read_json_gcs_bq_engine_w_index*.json" + read_path 
= path.replace("*", FIRST_FILE) + scalars_df.to_json(path, index=False, lines=True, orient="records") + df = session.read_json(read_path, lines=True, orient="records", engine="bigquery") + + # The auto detects of BigQuery load job does not preserve any ordering of columns for json. + pd.testing.assert_index_equal( + df.columns.sort_values(), scalars_df.columns.sort_values() + ) + + # The auto detects of BigQuery load job have restrictions to detect the bytes, + # datetime, numeric and geometry types, so they're skipped here. + df = df.drop(columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"]) + scalars_df = scalars_df.drop( + columns=["bytes_col", "datetime_col", "numeric_col", "geography_col"] + ) + assert df.shape[0] == scalars_df.shape[0] + pd.testing.assert_series_equal( + df.dtypes.sort_index(), scalars_df.dtypes.sort_index() + ) + + +def test_read_json_gcs_default_engine(session, scalars_dfs, gcs_folder): + scalars_df, _ = scalars_dfs + path = gcs_folder + "test_read_json_gcs_default_engine_w_index*.json" + read_path = path.replace("*", FIRST_FILE) + scalars_df.to_json( + path, + index=False, + lines=True, + orient="records", + ) + dtype = scalars_df.dtypes.to_dict() + dtype.pop("geography_col") + + df = session.read_json( + read_path, + # Convert default pandas dtypes to match BigQuery DataFrames dtypes. + dtype=dtype, + lines=True, + orient="records", + ) + + assert df._block._expr._ordering is not None + pd.testing.assert_index_equal(df.columns, scalars_df.columns) + + # The auto detects of BigQuery load job have restrictions to detect the bytes, + # numeric and geometry types, so they're skipped here. 
+ df = df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) + scalars_df = scalars_df.drop(columns=["bytes_col", "numeric_col", "geography_col"]) + + # pandas read_json does not respect the dtype overrides for these columns + df = df.drop(columns=["date_col", "datetime_col", "time_col"]) + scalars_df = scalars_df.drop(columns=["date_col", "datetime_col", "time_col"]) + + assert df.shape[0] == scalars_df.shape[0] + pd.testing.assert_series_equal(df.dtypes, scalars_df.dtypes) + + def test_session_id(session): assert session._session_id is not None diff --git a/tests/unit/core/test_utils.py b/tests/unit/core/test_utils.py new file mode 100644 index 0000000000..fc34f35d9c --- /dev/null +++ b/tests/unit/core/test_utils.py @@ -0,0 +1,56 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from bigframes.core import utils + + +def test_get_standardized_ids_columns(): + col_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] + + col_ids, idx_ids = utils.get_standardized_ids(col_labels) + + assert col_ids == [ + "string", + "0", + utils.UNNAMED_COLUMN_ID, + "duplicate", + "duplicate.1", + "with_space", + ] + assert idx_ids == [] + + +def test_get_standardized_ids_indexes(): + col_labels = ["duplicate"] + idx_labels = ["string", 0, None, "duplicate", "duplicate", "with space"] + + col_ids, idx_ids = utils.get_standardized_ids(col_labels, idx_labels) + + assert col_ids == ["duplicate.2"] + assert idx_ids == [ + "string", + "0", + utils.UNNAMED_INDEX_ID, + "duplicate", + "duplicate.1", + "with_space", + ] + + +def test_get_standardized_ids_tuple(): + col_labels = [("foo", 1), ("foo", 2), ("bar", 1)] + + col_ids, _ = utils.get_standardized_ids(col_labels) + + assert col_ids == ["('foo',_1)", "('foo',_2)", "('bar',_1)"] diff --git a/tests/unit/ml/test_sql.py b/tests/unit/ml/test_sql.py index d8c8a2d108..c20a17f7d6 100644 --- a/tests/unit/ml/test_sql.py +++ b/tests/unit/ml/test_sql.py @@ -162,3 +162,18 @@ def test_ml_generate_text_produces_correct_sql(): == """SELECT * FROM ML.GENERATE_TEXT(MODEL `my_dataset.my_model`, (SELECT * FROM my_table), STRUCT(value AS item))""" ) + + +def test_ml_principal_components_produces_correct_sql(): + sql = ml_sql.ml_principal_components(model_name="my_dataset.my_model") + assert ( + sql == """SELECT * FROM ML.PRINCIPAL_COMPONENTS(MODEL `my_dataset.my_model`)""" + ) + + +def test_ml_principal_component_info_produces_correct_sql(): + sql = ml_sql.ml_principal_component_info(model_name="my_dataset.my_model") + assert ( + sql + == """SELECT * FROM ML.PRINCIPAL_COMPONENT_INFO(MODEL `my_dataset.my_model`)""" + ) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 6762afc61f..5a812dae7e 100644 --- 
a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -15,6 +15,7 @@ import numpy +from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame # ----------------------------------------------------------------------- @@ -33,7 +34,7 @@ class DataFrame(NDFrame): @property def shape(self) -> tuple[int, int]: """Return a tuple representing the dimensionality of the DataFrame.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def axes(self) -> list: @@ -67,7 +68,7 @@ def values(self) -> numpy.ndarray: na_value (default None): The value to use for missing values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # IO methods (to / from other formats) @@ -90,7 +91,7 @@ def to_numpy( Returns: numpy.ndarray: The converted NumPy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_gbq( self, @@ -124,7 +125,7 @@ def to_gbq( If set, write the ordering of the DataFrame as a column in the result table with this name. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_parquet( self, @@ -151,7 +152,7 @@ def to_parquet( Returns: None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Unsorted @@ -179,7 +180,7 @@ def assign(self, **kwargs) -> DataFrame: bigframes.dataframe.DataFrame: A new DataFrame with the new columns in addition to all the existing columns. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Reindexing and alignment @@ -211,7 +212,7 @@ def drop( Raises: KeyError: If any of the labels is not found in the selected axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename( self, @@ -233,7 +234,7 @@ def rename( Raises: KeyError: If any of the labels is not found. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: """ @@ -250,7 +251,7 @@ def rename_axis(self, mapper: Optional[str], **kwargs) -> DataFrame: Returns: bigframes.dataframe.DataFrame: DataFrame with the new index name """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def set_index( self, @@ -273,7 +274,7 @@ def set_index( Returns: DataFrame: Changed row labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: """ @@ -287,7 +288,7 @@ def reorder_levels(self, order: Sequence[int | str]) -> DataFrame: Returns: DataFrame: DataFrame of rearranged index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def droplevel(self, level): """ @@ -301,7 +302,7 @@ def droplevel(self, level): Returns: DataFrame: DataFrame with requested index / column level(s) removed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reset_index( self, @@ -320,7 +321,7 @@ def reset_index( Returns: bigframes.dataframe.DataFrame: DataFrame with the new index. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop_duplicates( self, @@ -347,7 +348,7 @@ def drop_duplicates( Returns: bigframes.dataframe.DataFrame: DataFrame with duplicates removed """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def duplicated(self, subset=None, keep="first"): """ @@ -369,7 +370,7 @@ def duplicated(self, subset=None, keep="first"): Returns: bigframes.series.Series: Boolean series for each duplicated rows. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Reindex-based selection methods @@ -382,7 +383,7 @@ def dropna( Returns: bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Sorting @@ -415,7 +416,7 @@ def sort_values( Returns: DataFrame with sorted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_index( self, @@ -425,7 +426,7 @@ def sort_index( Returns: The original DataFrame sorted by the labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Arithmetic Methods @@ -450,7 +451,7 @@ def eq(self, other, axis: str | int = "columns") -> DataFrame: Returns: Result of the comparison. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ne(self, other, axis: str | int = "columns") -> DataFrame: """ @@ -471,7 +472,7 @@ def ne(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: Result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def le(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than or equal to' of dataframe and other, element-wise (binary operator `<=`). @@ -497,7 +498,7 @@ def le(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'less than' of DataFrame and other, element-wise (binary operator `<`). @@ -523,7 +524,7 @@ def lt(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ge(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than or equal to' of DataFrame and other, element-wise (binary operator `>=`). @@ -549,7 +550,7 @@ def ge(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def gt(self, other, axis: str | int = "columns") -> DataFrame: """Get 'greater than' of DataFrame and other, element-wise (binary operator `>`). 
@@ -575,7 +576,7 @@ def gt(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame of bool: The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add(self, other, axis: str | int = "columns") -> DataFrame: """Get addition of DataFrame and other, element-wise (binary operator `+`). @@ -598,7 +599,7 @@ def add(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -621,7 +622,7 @@ def sub(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rsub(self, other, axis: str | int = "columns") -> DataFrame: """Get subtraction of DataFrame and other, element-wise (binary operator `-`). @@ -644,7 +645,7 @@ def rsub(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mul(self, other, axis: str | int = "columns") -> DataFrame: """Get multiplication of DataFrame and other, element-wise (binary operator `*`). @@ -667,7 +668,7 @@ def mul(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def truediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -690,7 +691,7 @@ def truediv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: """Get floating division of DataFrame and other, element-wise (binary operator `/`). @@ -713,7 +714,7 @@ def rtruediv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -736,7 +737,7 @@ def floordiv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: """Get integer division of DataFrame and other, element-wise (binary operator `//`). @@ -759,7 +760,7 @@ def rfloordiv(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). 
@@ -782,7 +783,7 @@ def mod(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmod(self, other, axis: str | int = "columns") -> DataFrame: """Get modulo of DataFrame and other, element-wise (binary operator `%`). @@ -805,7 +806,7 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: Returns: DataFrame: DataFrame result of the arithmetic operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Data reshaping @@ -847,7 +848,7 @@ def groupby( Returns: bigframes.core.groupby.SeriesGroupBy: A groupby object that contains information about the groups. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Function application @@ -874,7 +875,7 @@ def map(self, func, na_action: Optional[str] = None) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Transformed DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Merging / joining methods @@ -902,7 +903,7 @@ def join(self, other, *, on: Optional[str] = None, how: str) -> DataFrame: Returns: bigframes.dataframe.DataFrame: A dataframe containing columns from both the caller and `other`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def merge( self, @@ -972,7 +973,7 @@ def merge( Returns: bigframes.dataframe.DataFrame: A DataFrame of the two merged objects. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # ndarray-like stats methods @@ -992,7 +993,7 @@ def any(self, *, bool_only: bool = False): Returns: Series """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all(self, *, bool_only: bool = False): """ @@ -1009,7 +1010,7 @@ def all(self, *, bool_only: bool = False): Returns: bigframes.series.Series: Series if all elements are True. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self, *, numeric_only: bool = False): """ @@ -1022,7 +1023,7 @@ def prod(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the product of the values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min(self, *, numeric_only: bool = False): """Return the minimum of the values over the requested axis. @@ -1037,7 +1038,7 @@ def min(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the minimum of the values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max(self, *, numeric_only: bool = False): """Return the maximum of the values over the requested axis. @@ -1052,7 +1053,7 @@ def max(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series after the maximum of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self, *, numeric_only: bool = False): """Return the sum of the values over the requested axis. 
@@ -1066,7 +1067,7 @@ def sum(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the sum of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self, *, numeric_only: bool = False): """Return the mean of the values over the requested axis. @@ -1078,7 +1079,7 @@ def mean(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with the mean of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median(self, *, numeric_only: bool = False, exact: bool = False): """Return the median of the values over the requested axis. @@ -1093,7 +1094,7 @@ def median(self, *, numeric_only: bool = False, exact: bool = False): Returns: bigframes.series.Series: Series with the median of values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var(self, *, numeric_only: bool = False): """Return unbiased variance over requested axis. @@ -1107,7 +1108,7 @@ def var(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with unbiased variance over requested axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self, *, numeric_only: bool = False): """Return sample standard deviation over requested axis. @@ -1121,7 +1122,7 @@ def std(self, *, numeric_only: bool = False): Returns: bigframes.series.Series: Series with sample standard deviation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self, *, numeric_only: bool = False): """ @@ -1138,7 +1139,7 @@ def count(self, *, numeric_only: bool = False): bigframes.series.Series: For each column/row the number of non-NA/null entries. 
If `level` is specified returns a `DataFrame`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nunique(self): """ @@ -1147,7 +1148,7 @@ def nunique(self): Returns: bigframes.series.Series: Series with number of distinct elements. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self) -> DataFrame: """Return cumulative minimum over a DataFrame axis. @@ -1157,7 +1158,7 @@ def cummin(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative minimum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self) -> DataFrame: """Return cumulative maximum over a DataFrame axis. @@ -1167,7 +1168,7 @@ def cummax(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative maximum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self) -> DataFrame: """Return cumulative sum over a DataFrame axis. @@ -1177,7 +1178,7 @@ def cumsum(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative sum of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self) -> DataFrame: """Return cumulative product over a DataFrame axis. @@ -1187,7 +1188,7 @@ def cumprod(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: Return cumulative product of DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def agg(self, func): """ @@ -1202,7 +1203,7 @@ def agg(self, func): Returns: DataFrame or bigframes.series.Series: Aggregated results. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def describe(self): """ @@ -1227,7 +1228,7 @@ def describe(self): Returns: bigframes.dataframe.DataFrame: Summary statistics of the Series or Dataframe provided. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pivot(self, *, columns, index=None, values=None): """ @@ -1263,7 +1264,30 @@ def pivot(self, *, columns, index=None, values=None): Returns: Returns reshaped DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def stack(self): + """ + Stack the prescribed level(s) from columns to index. + + Return a reshaped DataFrame or Series having a multi-level + index with one or more new inner-most levels compared to the current + DataFrame. The new inner-most levels are created by pivoting the + columns of the current dataframe: + + - if the columns have a single level, the output is a Series; + - if the columns have multiple levels, the new index + level(s) is (are) taken from the prescribed level(s) and + the output is a DataFrame. + + .. note:: + BigQuery DataFrames does not support stack operations that would + combine columns of different dtypes. + + Returns: + DataFrame or Series: Stacked dataframe or series. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Add index and columns @@ -1280,12 +1304,12 @@ def index(self): Returns: The index labels of the DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def columns(self): "The column labels of the DataFrame." 
- raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def value_counts( self, @@ -1313,4 +1337,4 @@ def value_counts( Returns: Series: Series containing counts of unique rows in the DataFrame """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/generic.py b/third_party/bigframes_vendored/pandas/core/generic.py index 4843c971da..56d3b2434f 100644 --- a/third_party/bigframes_vendored/pandas/core/generic.py +++ b/third_party/bigframes_vendored/pandas/core/generic.py @@ -3,6 +3,7 @@ from typing import Literal, Optional +from bigframes import constants from third_party.bigframes_vendored.pandas.core import indexing @@ -22,7 +23,7 @@ def ndim(self) -> int: Returns: int: Return 1 if Series. Otherwise return 2 if DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def size(self) -> int: @@ -32,7 +33,7 @@ def size(self) -> int: int: Return the number of rows if Series. Otherwise return the number of rows times number of columns if DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ------------------------------------------------------------------------- # Unary Methods @@ -46,7 +47,7 @@ def abs(self): Series/DataFrame containing the absolute value of each element. Returns a Series/DataFrame containing the absolute value of each element. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def astype(self, dtype): """ @@ -66,7 +67,7 @@ def astype(self, dtype): same type as caller """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Iteration @@ -85,7 +86,7 @@ def empty(self) -> bool: Returns: bool: If Series/DataFrame is empty, return True, if not return False. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # I/O Methods @@ -155,7 +156,7 @@ def to_json( Returns: None: String output not yet supported. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: """Write object to a comma-separated values (csv) file on Cloud Storage. @@ -177,7 +178,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: Returns: None: String output not yet supported. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Unsorted @@ -215,7 +216,7 @@ def add_prefix(self, prefix: str, axis: int | str | None = None): Returns: New Series or DataFrame with updated labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add_suffix(self, suffix: str, axis: int | str | None = None): """Suffix labels with string `suffix`. @@ -233,7 +234,7 @@ def add_suffix(self, suffix: str, axis: int | str | None = None): Returns: New Series or DataFrame with updated labels. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def head(self, n: int = 5): """Return the first `n` rows. @@ -254,7 +255,7 @@ def head(self, n: int = 5): Returns: The first `n` rows of the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def tail(self, n: int = 5): """Return the last `n` rows. @@ -275,7 +276,7 @@ def tail(self, n: int = 5): Returns: The last `n` rows of the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sample( self, @@ -301,7 +302,7 @@ def sample( A new object of same type as caller containing `n` items randomly sampled from the caller object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Internal Interface Methods @@ -317,7 +318,7 @@ def dtypes(self): Returns: A *pandas* Series with the data type of each column. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def copy(self): """Make a copy of this object's indices and data. @@ -329,7 +330,7 @@ def copy(self): Returns: Object type matches caller. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # Action Methods @@ -346,7 +347,7 @@ def isna(self) -> NDFrame: Mask of bool values for each element that indicates whether an element is an NA value. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) isnull = isna @@ -362,7 +363,7 @@ def notna(self) -> NDFrame: NDFrame: Mask of bool values for each element that indicates whether an element is not an NA value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) notnull = notna @@ -381,7 +382,7 @@ def shift( Returns: NDFrame: Copy of input object, shifted. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rank( self, @@ -419,7 +420,7 @@ def rank( Returns: same type as caller: Return a Series or DataFrame with data ranks as values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __nonzero__(self): raise ValueError( diff --git a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py index 95822718c3..9271da8a5e 100644 --- a/third_party/bigframes_vendored/pandas/core/groupby/__init__.py +++ b/third_party/bigframes_vendored/pandas/core/groupby/__init__.py @@ -9,6 +9,8 @@ class providing the base-class of operations. """ from __future__ import annotations +from bigframes import constants + class GroupBy: """ @@ -24,7 +26,7 @@ def any(self): where a value is True if any element is True within its respective group, False otherwise. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all(self): """ @@ -35,7 +37,7 @@ def all(self): where a value is True if all elements are True within its respective group, False otherwise. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self): """ @@ -44,7 +46,7 @@ def count(self): Returns: Series or DataFrame: Count of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean( self, @@ -60,7 +62,7 @@ def mean( Returns: pandas.Series or pandas.DataFrame: Mean of groups. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median( self, @@ -81,7 +83,7 @@ def median( Returns: pandas.Series or pandas.DataFrame: Median of groups. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std( self, @@ -100,7 +102,7 @@ def std( Returns: Series or DataFrame: Standard deviation of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var( self, @@ -120,7 +122,7 @@ def var( Series or DataFrame Variance of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum( self, @@ -140,7 +142,7 @@ def sum( Returns: Series or DataFrame: Computed sum of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self, numeric_only: bool = False, min_count: int = 0): """ @@ -156,7 +158,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0): Returns: Series or DataFrame: Computed prod of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min( self, @@ -176,7 +178,7 @@ def min( Returns: Series or DataFrame: Computed min of values within each group. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max( self, @@ -196,7 +198,7 @@ def max( Returns: Series or DataFrame: Computed max of values within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumcount(self, ascending: bool = True): """ @@ -209,7 +211,7 @@ def cumcount(self, ascending: bool = True): Returns: Series: Sequence number of each element within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self, *args, **kwargs): """ @@ -218,7 +220,7 @@ def cumprod(self, *args, **kwargs): Returns: Series or DataFrame: Cumulative product for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self, *args, **kwargs): """ @@ -227,7 +229,7 @@ def cumsum(self, *args, **kwargs): Returns: Series or DataFrame: Cumulative sum for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self, *args, numeric_only: bool = False, **kwargs): """ @@ -236,7 +238,7 @@ def cummin(self, *args, numeric_only: bool = False, **kwargs): Returns: Series or DataFrame: Cumulative min for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self, *args, numeric_only: bool = False, **kwargs): """ @@ -245,7 +247,7 @@ def cummax(self, *args, numeric_only: bool = False, **kwargs): Returns: Series or DataFrame: Cumulative max for each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def diff(self): """ @@ -256,7 +258,7 @@ def diff(self): Returns: Series or DataFrame: First differences. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def shift(self, periods: int = 1): """ @@ -269,7 +271,7 @@ def shift(self, periods: int = 1): Returns: Series or DataFrame: Object shifted within each group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rolling(self, *args, **kwargs): """ @@ -289,7 +291,7 @@ def rolling(self, *args, **kwargs): Returns: Series or DataFrame: Return a new grouper with our rolling appended. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def expanding(self, *args, **kwargs): """ @@ -298,7 +300,7 @@ def expanding(self, *args, **kwargs): Returns: Series or DataFrame: A expanding grouper, providing expanding functionality per group. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class SeriesGroupBy(GroupBy): @@ -318,7 +320,7 @@ def agg(self, func): Returns: Series or DataFrame """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class DataFrameGroupBy(GroupBy): @@ -347,4 +349,4 @@ def agg(self, func, **kwargs): Returns: DataFrame """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py index d59886e8aa..2b4a326317 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/accessor.py @@ -1,3 +1,6 @@ +from bigframes import constants + + class DatetimeProperties: """ Accessor object for datetime-like properties of the Series values. 
@@ -7,7 +10,7 @@ class DatetimeProperties: def day(self): """The day of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dayofweek(self): @@ -22,7 +25,7 @@ def dayofweek(self): Series or Index: Containing integers indicating the day number. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def date(self): @@ -36,31 +39,31 @@ def date(self): a numpy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def hour(self): """The hours of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def minute(self): """The minutes of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def month(self): """The month as January=1, December=12.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def second(self): """The seconds of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def time(self): @@ -73,7 +76,7 @@ def time(self): a numpy array. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def quarter(self): @@ -84,10 +87,10 @@ def quarter(self): a numpy array. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def year(self): """The year of the datetime.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexes/base.py b/third_party/bigframes_vendored/pandas/core/indexes/base.py index ebad5eb918..404a99809c 100644 --- a/third_party/bigframes_vendored/pandas/core/indexes/base.py +++ b/third_party/bigframes_vendored/pandas/core/indexes/base.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexes/base.py +from bigframes import constants + class Index: """Immutable sequence used for indexing and alignment. @@ -10,14 +12,14 @@ class Index: @property def name(self): """Returns Index name.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def shape(self): """ Return a tuple of the shape of the underlying data. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_numpy(self, dtype): """ @@ -33,4 +35,4 @@ def to_numpy(self, dtype): Returns: numpy.ndarray """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/indexing.py b/third_party/bigframes_vendored/pandas/core/indexing.py index d5b9f3c079..fae5d6261f 100644 --- a/third_party/bigframes_vendored/pandas/core/indexing.py +++ b/third_party/bigframes_vendored/pandas/core/indexing.py @@ -1,5 +1,7 @@ # Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/indexing.py +from bigframes import constants + class IndexingMixin: """ @@ -32,7 +34,7 @@ def iloc(self): out-of-bounds, except *slice* indexers which allow out-of-bounds indexing (this conforms with python/numpy *slice* semantics). """ - raise NotImplementedError("abstract methdod") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def loc(self): @@ -63,4 +65,4 @@ def loc(self): NotImplementError: if the inputs are not supported. 
""" - raise NotImplementedError("abstract methdod") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/concat.py b/third_party/bigframes_vendored/pandas/core/reshape/concat.py index 6a5a9fdde9..6e6d2d8b5c 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/concat.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/concat.py @@ -4,6 +4,8 @@ """ from __future__ import annotations +from bigframes import constants + def concat( objs, @@ -135,4 +137,4 @@ def concat( [4 rows x 2 columns] """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/tile.py b/third_party/bigframes_vendored/pandas/core/reshape/tile.py index 9381ad4552..4f5f2efef0 100644 --- a/third_party/bigframes_vendored/pandas/core/reshape/tile.py +++ b/third_party/bigframes_vendored/pandas/core/reshape/tile.py @@ -4,6 +4,8 @@ """ from __future__ import annotations +from bigframes import constants + def cut( x, @@ -62,4 +64,4 @@ def cut( are whatever the type in the sequence is. False : returns an ndarray of integers. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 79eb402696..8d505c1ead 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -10,6 +10,7 @@ from pandas._typing import Axis, FilePath, NaPosition, WriteBuffer import pandas.io.formats.format as fmt +from bigframes import constants from third_party.bigframes_vendored.pandas.core.generic import NDFrame if TYPE_CHECKING: @@ -23,31 +24,31 @@ def dt(self): """ Accessor object for datetime-like properties of the Series values. 
""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def index(self): """The index (axis labels) of the Series.""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def shape(self): """Return a tuple of the shape of the underlying data.""" - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dtype(self): """ Return the dtype object of the underlying data. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def dtypes(self): """ Return the dtype object of the underlying data. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def name(self) -> Hashable: @@ -62,7 +63,7 @@ def name(self) -> Hashable: hashable object: The name of the Series, also the column name if part of a DataFrame. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reset_index( self, @@ -94,13 +95,13 @@ def reset_index( In either case, if ``inplace=True``, no value is returned. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __repr__(self) -> str: """ Return a string representation for a particular Series. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # IO methods (to / from other formats) @@ -165,7 +166,7 @@ def to_string( result = formatter.to_string() # catch contract violations - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_markdown( self, @@ -188,7 +189,7 @@ def to_markdown( Returns: str: {klass} in Markdown-friendly format. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_dict(self, into: type[dict] = dict) -> Mapping: """ @@ -204,7 +205,7 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: Returns: collections.abc.Mapping: Key-value representation of Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_frame(self) -> DataFrame: """ @@ -213,7 +214,7 @@ def to_frame(self) -> DataFrame: Returns: bigframes.dataframe.DataFrame: DataFrame representation of Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_excel(self, excel_writer, sheet_name): """ @@ -235,7 +236,7 @@ def to_excel(self, excel_writer, sheet_name): sheet_name (str, default 'Sheet1'): Name of sheet to contain Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): """ @@ -256,7 +257,7 @@ def to_latex(self, buf=None, columns=None, header=True, index=True, **kwargs): str or None: If buf is None, returns the result as a string. Otherwise returns None. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def tolist(self) -> list: """ @@ -269,7 +270,7 @@ def tolist(self) -> list: Returns: list: list of the values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) to_list = tolist @@ -296,7 +297,7 @@ def to_numpy(self, dtype, copy=False, na_value=None): numpy.ndarray: A NumPy ndarray representing the values in this Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_pickle(self, path, **kwargs): """ @@ -308,7 +309,7 @@ def to_pickle(self, path, **kwargs): object implementing a binary ``write()`` function. File path where the pickled object will be stored. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_xarray(self): """ @@ -319,7 +320,7 @@ def to_xarray(self): converted to Dataset if the object is a DataFrame, or a DataArray if the object is a Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_json( self, @@ -354,7 +355,7 @@ def to_json( None or str: If path_or_buf is None, returns the resulting json format as a string. Otherwise returns None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: """ @@ -372,7 +373,7 @@ def to_csv(self, path_or_buf: str, *, index: bool = True) -> str | None: None or str: If path_or_buf is None, returns the resulting csv format as a string. Otherwise returns None. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def agg(self, func): """ @@ -387,7 +388,7 @@ def agg(self, func): Returns: scalar or Series: Aggregated results """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def count(self): """ @@ -397,7 +398,7 @@ def count(self): int or Series (if level specified): Number of non-null values in the Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nunique(self) -> int: """ @@ -408,7 +409,7 @@ def nunique(self) -> int: Returns: int: number of unique elements in the object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mode(self) -> Series: """ @@ -421,7 +422,7 @@ def mode(self) -> Series: Returns: bigframes.series.Series: Modes of the Series in sorted order. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop_duplicates( self, @@ -442,7 +443,7 @@ def drop_duplicates( Returns: bigframes.series.Series: Series with duplicates dropped or None if ``inplace=True``. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def duplicated(self, keep="first") -> Series: """ @@ -466,7 +467,7 @@ def duplicated(self, keep="first") -> Series: bigframes.series.Series: Series indicating whether each value has occurred in the preceding values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def round(self, decimals: int = 0) -> Series: """ @@ -480,6 +481,29 @@ def round(self, decimals: int = 0) -> Series: Returns: bigframes.series.Series: Rounded values of the Series. 
""" + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def corr(self, other, method="pearson", min_periods=None) -> float: + """ + Compute the correlation with the other Series. Non-number values are ignored in the + computation. + + Uses the "Pearson" method of correlation. Numbers are converted to float before + calculation, so the result may be unstable. + + Args: + other (Series): + The series with which this is to be correlated. + method (string, default "pearson"): + Correlation method to use - currently only "pearson" is supported. + min_periods (int, default None): + The minimum number of observations needed to return a result. Non-default values + are not yet supported, so a result will be returned for at least two observations. + + Returns: + float; Will return NaN if there are fewer than two numeric pairs, either series has a + variance or covariance of zero, or any input value is infinite. + """ raise NotImplementedError("abstract method") def diff(self) -> Series: @@ -497,7 +521,7 @@ def diff(self) -> Series: Returns: {klass}: First differences of the Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def dot(self, other) -> Series | np.ndarray: """ @@ -527,19 +551,19 @@ def dot(self, other) -> Series | np.ndarray: """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __matmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def __rmatmul__(self, other): """ Matrix multiplication using binary `@` operator in Python>=3.5. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_values( self, @@ -571,7 +595,7 @@ def sort_values( Returns: bigframes.series.Series: Series ordered by values or None if ``inplace=True``. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sort_index( self, @@ -602,7 +626,7 @@ def sort_index( """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nlargest( self, n: int = 5, keep: Literal["first", "last", "all"] = "first" @@ -626,7 +650,7 @@ def nlargest( Returns: bigframes.series.Series: The `n` largest values in the Series, sorted in decreasing order. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def nsmallest(self, n: int = 5, keep: str = "first") -> Series: """ @@ -649,7 +673,7 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: Returns: bigframes.series.Series: The `n` smallest values in the Series, sorted in increasing order. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) # ---------------------------------------------------------------------- # function application @@ -672,7 +696,7 @@ def apply( bigframes.series.Series: If func returns a Series object the result will be a DataFrame. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def groupby( self, @@ -726,7 +750,7 @@ def groupby( bigframes.core.groupby.SeriesGroupBy: Returns a groupby object that contains information about the groups. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def drop( self, labels=None, *, axis=0, index=None, columns=None, level=None @@ -758,7 +782,7 @@ def drop( Raises: KeyError: If none of the labels are found in the index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def reorder_levels(self, order: Sequence) -> Series: """ @@ -773,7 +797,7 @@ def reorder_levels(self, order: Sequence) -> Series: Returns: type of caller (new object) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def droplevel(self, level): """ @@ -788,7 +812,7 @@ def droplevel(self, level): Returns: Series with requested index / column level(s) removed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fillna( self, @@ -804,7 +828,7 @@ def fillna( Returns: Series or None: Object with missing values filled or None. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def between( self, @@ -832,7 +856,7 @@ def between( right (inclusive). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumprod(self): """ @@ -844,7 +868,7 @@ def cumprod(self): Returns: bigframes.series.Series: Return cumulative sum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cumsum(self): """ @@ -861,7 +885,7 @@ def cumsum(self): Returns: scalar or Series: Return cumulative sum of scalar or Series. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummax(self): """ @@ -878,7 +902,7 @@ def cummax(self): Returns: bigframes.series.Series: Return cumulative maximum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cummin(self): """ @@ -901,7 +925,7 @@ def cummin(self): Returns: bigframes.series.Series: Return cumulative minimum of scalar or Series. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def eq(self, other) -> Series: """Return equal of Series and other, element-wise (binary operator eq). @@ -916,7 +940,7 @@ def eq(self, other) -> Series: Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ne(self, other) -> Series: """Return not equal of Series and other, element-wise (binary operator ne). @@ -931,7 +955,7 @@ def ne(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def le(self, other) -> Series: """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). @@ -946,7 +970,7 @@ def le(self, other) -> Series: bigframes.series.Series. The result of the comparison. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lt(self, other) -> Series: """Get 'less than' of Series and other, element-wise (binary operator `<`). @@ -961,7 +985,7 @@ def lt(self, other) -> Series: bigframes.series.Series: The result of the operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ge(self, other) -> Series: """Get 'greater than or equal to' of Series and other, element-wise (binary operator `>=`). @@ -976,7 +1000,7 @@ def ge(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def gt(self, other) -> Series: """Get 'less than or equal to' of Series and other, element-wise (binary operator `<=`). @@ -991,7 +1015,7 @@ def gt(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def add(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator add). @@ -1006,7 +1030,7 @@ def add(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def radd(self, other) -> Series: """Return addition of Series and other, element-wise (binary operator radd). @@ -1021,7 +1045,7 @@ def radd(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sub( self, @@ -1039,7 +1063,7 @@ def sub( bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rsub(self, other) -> Series: """Return subtraction of Series and other, element-wise (binary operator rsub). @@ -1054,7 +1078,7 @@ def rsub(self, other) -> Series: bigframes.series.Series: The result of the operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -1069,7 +1093,7 @@ def mul(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmul(self, other) -> Series: """Return multiplication of Series and other, element-wise (binary operator mul). @@ -1083,7 +1107,7 @@ def rmul(self, other) -> Series: Returns: Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def truediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator truediv). @@ -1098,7 +1122,7 @@ def truediv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rtruediv(self, other) -> Series: """Return floating division of Series and other, element-wise (binary operator rtruediv). @@ -1113,7 +1137,7 @@ def rtruediv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def floordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator floordiv). @@ -1128,7 +1152,7 @@ def floordiv(self, other) -> Series: bigframes.series.Series: The result of the operation. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rfloordiv(self, other) -> Series: """Return integer division of Series and other, element-wise (binary operator rfloordiv). @@ -1143,7 +1167,7 @@ def rfloordiv(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mod(self, other) -> Series: """Return modulo of Series and other, element-wise (binary operator mod). @@ -1158,7 +1182,7 @@ def mod(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmod(self, other) -> Series: """Get modulo of Series and other, element-wise (binary operator `rmod`). @@ -1173,7 +1197,7 @@ def rmod(self, other) -> Series: bigframes.series.Series: The result of the operation. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def divmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator divmod). @@ -1188,7 +1212,7 @@ def divmod(self, other) -> Series: consistent with (floordiv, mod) (though pandas may not). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rdivmod(self, other) -> Series: """Return integer division and modulo of Series and other, element-wise (binary operator rdivmod). @@ -1203,7 +1227,7 @@ def rdivmod(self, other) -> Series: consistent with (rfloordiv, rmod) (though pandas may not). 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def all( self, @@ -1218,7 +1242,7 @@ def all( scalar or Series: If level is specified, then, Series is returned; otherwise, scalar is returned. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def any( self, @@ -1233,7 +1257,7 @@ def any( scalar or Series: If level is specified, then, Series is returned; otherwise, scalar is returned. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max( self, @@ -1248,7 +1272,7 @@ def max( Returns: scalar or scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min( self, @@ -1262,7 +1286,7 @@ def min( Returns: scalar or scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std( self, @@ -1277,7 +1301,7 @@ def std( ------- scalar or Series (if level specified) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var( self, @@ -1290,7 +1314,7 @@ def var( Returns: scalar or Series (if level specified) """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self): """Return the sum of the values over the requested axis. @@ -1300,7 +1324,7 @@ def sum(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self): """Return the mean of the values over the requested axis. 
@@ -1308,7 +1332,7 @@ def mean(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def median(self, *, exact: bool = False): """Return the median of the values over the requested axis. @@ -1321,7 +1345,7 @@ def median(self, *, exact: bool = False): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def prod(self): """Return the product of the values over the requested axis. @@ -1329,7 +1353,7 @@ def prod(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def skew(self): """Return unbiased skew over requested axis. @@ -1339,7 +1363,7 @@ def skew(self): Returns: scalar """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def kurt(self): """Return unbiased kurtosis over requested axis. @@ -1349,7 +1373,7 @@ def kurt(self): Returns: scalar or scalar: Unbiased kurtosis over requested axis. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def where(self, cond, other): """Replace values where the condition is False. @@ -1373,7 +1397,7 @@ def where(self, cond, other): Returns: bigframes.series.Series: Series after the replacement. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mask(self, cond, other): """Replace values where the condition is True. @@ -1397,7 +1421,7 @@ def mask(self, cond, other): Returns: bigframes.series.Series: Series after the replacement. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def clip(self): """Trim values at input threshold(s). @@ -1416,7 +1440,7 @@ def clip(self): Returns: Series. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def argmax(self): """ @@ -1427,7 +1451,7 @@ def argmax(self): Returns: Series: Row position of the maximum value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def argmin(self): """ @@ -1438,7 +1462,7 @@ def argmin(self): Returns: Series: Row position of the minimum value. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename(self, index, **kwargs) -> Series | None: """ @@ -1461,7 +1485,7 @@ def rename(self, index, **kwargs) -> Series | None: bigframes.series.Series: Series with index labels. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rename_axis(self, mapper, **kwargs): """ @@ -1474,7 +1498,7 @@ def rename_axis(self, mapper, **kwargs): Returns: bigframes.series.Series: Series with the name of the axis set. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rolling( self, @@ -1515,7 +1539,7 @@ def rolling( bigframes.core.window.Window: ``Window`` subclass if a ``win_type`` is passed. ``Rolling`` subclass if ``win_type`` is not passed. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def expanding(self, min_periods=1): """ @@ -1529,7 +1553,7 @@ def expanding(self, min_periods=1): Returns: bigframes.core.window.Window: ``Expanding`` subclass. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def value_counts( self, @@ -1560,7 +1584,7 @@ def value_counts( Returns: Series: Series containing counts of unique values. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def str(self): @@ -1570,7 +1594,7 @@ def str(self): NAs stay NA unless handled otherwise by a particular method. Patterned after Python’s string methods, with some inspiration from R’s stringr package. """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isin(self, values): """ @@ -1595,7 +1619,7 @@ def isin(self, values): Raises: TypeError: If input is not list-like. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def is_monotonic_increasing(self) -> bool: @@ -1605,7 +1629,7 @@ def is_monotonic_increasing(self) -> bool: Returns: bool """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def is_monotonic_decreasing(self) -> bool: @@ -1615,4 +1639,4 @@ def is_monotonic_decreasing(self) -> bool: Returns: bool """ - raise NotImplementedError("abstract property") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/strings/accessor.py b/third_party/bigframes_vendored/pandas/core/strings/accessor.py index 5e3d0b047f..ecdd9547d5 100644 --- a/third_party/bigframes_vendored/pandas/core/strings/accessor.py +++ b/third_party/bigframes_vendored/pandas/core/strings/accessor.py @@ -1,6 +1,8 @@ import re import typing +from bigframes import constants + class StringMethods: """ @@ -32,7 +34,7 @@ def extract(self, pat: str, flags: int = 0): expression pat will be used for column names; otherwise capture group numbers will be used. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def find(self, sub, start: int = 0, end=None): """Return lowest indexes in each strings in the Series/Index. @@ -52,7 +54,7 @@ def find(self, sub, start: int = 0, end=None): Returns: bigframes.series.Series: Series with lowest indexes in each strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def len(self): """Compute the length of each element in the Series/Index. @@ -65,7 +67,7 @@ def len(self): the length of each element in the Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lower(self): """Convert strings in the Series/Index to lowercase. @@ -76,7 +78,7 @@ def lower(self): bigframes.series.Series: Series with lowercase. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def slice(self, start=None, stop=None): """Slice substrings from each element in the Series or Index. @@ -94,7 +96,7 @@ def slice(self, start=None, stop=None): substring from original string object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def strip(self): """Remove leading and trailing characters. @@ -109,7 +111,7 @@ def strip(self): and trailing characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def upper(self): """Convert strings in the Series/Index to uppercase. @@ -120,7 +122,7 @@ def upper(self): bigframes.series.Series: Series with uppercase strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isnumeric(self): """Check whether all characters in each string are numeric. 
@@ -134,7 +136,7 @@ def isnumeric(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isalpha(self): """Check whether all characters in each string are alphabetic. @@ -147,7 +149,7 @@ def isalpha(self): bigframes.series.Series: Series with the same length as the originalSeries/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isdigit(self): """Check whether all characters in each string are digits. @@ -160,7 +162,7 @@ def isdigit(self): bigframes.series.Series: Series with the same length as the originalSeries/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isalnum(self): """Check whether all characters in each string are alphanumeric. @@ -174,7 +176,7 @@ def isalnum(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isspace(self): """Check whether all characters in each string are whitespace. @@ -188,7 +190,7 @@ def isspace(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def islower(self): """Check whether all characters in each string are lowercase. @@ -202,7 +204,7 @@ def islower(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isupper(self): """Check whether all characters in each string are uppercase. @@ -216,7 +218,7 @@ def isupper(self): same length as the original Series/Index. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def isdecimal(self): """Check whether all characters in each string are decimal. @@ -230,7 +232,7 @@ def isdecimal(self): same length as the original Series/Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rstrip(self): """Remove trailing characters. @@ -244,7 +246,7 @@ def rstrip(self): bigframes.series.Series: Series without trailing characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def lstrip(self): """Remove leading characters. @@ -258,7 +260,7 @@ def lstrip(self): bigframes.series.Series: Series without leading characters. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def repeat(self, repeats: int): """Duplicate each string in the Series or Index. @@ -272,7 +274,7 @@ def repeat(self, repeats: int): objects specified by input parameter repeats. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def capitalize(self): """Convert strings in the Series/Index to be capitalized. @@ -283,7 +285,7 @@ def capitalize(self): bigframes.series.Series: Series with captitalized strings. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def cat(self, others, *, join): """Concatenate strings in the Series/Index with given separator. @@ -304,7 +306,7 @@ def cat(self, others, *, join): bigframes.series.Series: Series with concatenated strings. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True): """ @@ -329,7 +331,7 @@ def contains(self, pat, case: bool = True, flags: int = 0, *, regex: bool = True whether the given pattern is contained within the string of each element of the Series or Index. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def replace( self, @@ -373,7 +375,7 @@ def replace( of `pat` replaced by `repl`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def startswith( self, @@ -391,7 +393,7 @@ def startswith( bigframes.series.Series: A Series of booleans indicating whether the given pattern matches the start of each string element. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def endswith( self, @@ -409,7 +411,7 @@ def endswith( bigframes.series.Series: A Series of booleans indicating whether the given pattern matches the end of each string element. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def match(self, pat: str, case: bool = True, flags: int = 0): """ @@ -426,7 +428,7 @@ def match(self, pat: str, case: bool = True, flags: int = 0): Returns: bigframes.series.Series: Series of boolean values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fullmatch(self, pat: str, case: bool = True, flags: int = 0): """ @@ -443,7 +445,7 @@ def fullmatch(self, pat: str, case: bool = True, flags: int = 0): Returns: bigframes.series.Series: Series of boolean values """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def get(self, i: int): """ @@ -459,7 +461,7 @@ def get(self, i: int): Returns: bigframes.series.Series: Series """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def pad( self, @@ -482,7 +484,7 @@ def pad( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def ljust( self, @@ -502,7 +504,7 @@ def ljust( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rjust( self, @@ -522,7 +524,7 @@ def rjust( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def zfill( self, @@ -544,7 +546,7 @@ def zfill( Returns: bigframes.series.Series: Series of objects. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def center( self, @@ -566,4 +568,4 @@ def center( Returns: bigframes.series.Series: Returns Series or Index with minimum number of char in object. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/window/rolling.py b/third_party/bigframes_vendored/pandas/core/window/rolling.py index 7a9239b70c..a869c86e72 100644 --- a/third_party/bigframes_vendored/pandas/core/window/rolling.py +++ b/third_party/bigframes_vendored/pandas/core/window/rolling.py @@ -4,34 +4,36 @@ similar to how we have a Groupby object. """ +from bigframes import constants + class Window: """Provide window calculations.""" def count(self): """Calculate the window count of non-NULL observations.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def sum(self): """Calculate the weighted window sum.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def mean(self): """Calculate the weighted window mean.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def var(self): """Calculate the weighted window variance.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def std(self): """Calculate the weighted window standard deviation.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def max(self): """Calculate the weighted window maximum.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def min(self): """Calculate the weighted window minimum.""" - raise 
NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/gbq.py b/third_party/bigframes_vendored/pandas/io/gbq.py index 730872034d..95531ff5e8 100644 --- a/third_party/bigframes_vendored/pandas/io/gbq.py +++ b/third_party/bigframes_vendored/pandas/io/gbq.py @@ -5,6 +5,8 @@ from typing import Iterable, Optional +from bigframes import constants + class GBQIOMixin: def read_gbq( @@ -83,4 +85,4 @@ def read_gbq( Returns: bigframes.dataframe.DataFrame: A DataFrame representing results of the query or table. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parquet.py b/third_party/bigframes_vendored/pandas/io/parquet.py index 6f0a2b3cb4..9aed9af5a8 100644 --- a/third_party/bigframes_vendored/pandas/io/parquet.py +++ b/third_party/bigframes_vendored/pandas/io/parquet.py @@ -2,6 +2,8 @@ """ parquet compat """ from __future__ import annotations +from bigframes import constants + class ParquetIOMixin: def read_parquet( @@ -22,4 +24,4 @@ def read_parquet( Returns: bigframes.dataframe.DataFrame: A BigQuery DataFrames. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/parsers/readers.py b/third_party/bigframes_vendored/pandas/io/parsers/readers.py index e01eb734fb..d19a92ecdf 100644 --- a/third_party/bigframes_vendored/pandas/io/parsers/readers.py +++ b/third_party/bigframes_vendored/pandas/io/parsers/readers.py @@ -6,10 +6,22 @@ """ from __future__ import annotations -from typing import Any, Dict, Literal, MutableSequence, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Dict, + IO, + Literal, + MutableSequence, + Optional, + Sequence, + Tuple, + Union, +) import numpy as np +from bigframes import constants + class ReaderIOMixin: def read_csv( @@ -45,7 +57,8 @@ def read_csv( Args: filepath_or_buffer (str): - a string path including Cloud Storage and local file. + A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` + otherwise passed to pandas.read_csv. sep (Optional[str], default ","): the separator for fields in a CSV file. For the BigQuery engine, the separator can be any ISO-8859-1 single-byte character. To use a character in the range @@ -104,10 +117,71 @@ def read_csv( https://docs.python.org/3/library/codecs.html#standard-encodings The BigQuery engine only supports `UTF-8` and `ISO-8859-1`. **kwargs: - keyword arguments. + keyword arguments for `pandas.read_csv` when not using the BigQuery engine. Returns: bigframes.dataframe.DataFrame: A BigQuery DataFrames. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def read_json( + self, + path_or_buf: str | IO["bytes"], + *, + orient: Literal[ + "split", "records", "index", "columns", "values", "table" + ] = "columns", + dtype: Optional[Dict] = None, + encoding: Optional[str] = None, + lines: bool = False, + engine: Literal["ujson", "pyarrow", "bigquery"] = "ujson", + **kwargs, + ): + """ + Convert a JSON string to DataFrame object. + + .. note:: + using `engine="bigquery"` will not guarantee the same ordering as the + file. Instead, set a serialized index column as the index and sort by + that in the resulting DataFrame. + + Args: + path_or_buf (a valid JSON str, path object or file-like object): + A local or Google Cloud Storage (`gs://`) path with `engine="bigquery"` + otherwise passed to pandas.read_json. + orient (str, optional): + If `engine="bigquery"` orient only supports "records". + Indication of expected JSON string format. + Compatible JSON strings can be produced by ``to_json()`` with a + corresponding orient value. + The set of possible orients is: + + - ``'split'`` : dict like + ``{{index -> [index], columns -> [columns], data -> [values]}}`` + - ``'records'`` : list like + ``[{{column -> value}}, ... , {{column -> value}}]`` + - ``'index'`` : dict like ``{{index -> {{column -> value}}}}`` + - ``'columns'`` : dict like ``{{column -> {{index -> value}}}}`` + - ``'values'`` : just the values array + + dtype (bool or dict, default None): + If True, infer dtypes; if a dict of column to dtype, then use those; + if False, then don't infer dtypes at all, applies only to the data. + + For all ``orient`` values except ``'table'``, default is True. + encoding (str, default is 'utf-8'): + The encoding to use to decode py3 bytes. + lines (bool, default False): + Read the file as a json object per line. If using `engine="bigquery"` lines only supports True. 
+ engine ({{"ujson", "pyarrow", "bigquery"}}, default "ujson"): + Type of engine to use. If `engine="bigquery"` is specified, then BigQuery's load API will be used. + Otherwise, the engine will be passed to `pandas.read_json`. + **kwargs: + keyword arguments for `pandas.read_json` when not using the BigQuery engine. + + Returns: + bigframes.dataframe.DataFrame: + The DataFrame representing JSON contents. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/io/pickle.py b/third_party/bigframes_vendored/pandas/io/pickle.py index a160ef0c4e..71b31956a0 100644 --- a/third_party/bigframes_vendored/pandas/io/pickle.py +++ b/third_party/bigframes_vendored/pandas/io/pickle.py @@ -9,6 +9,8 @@ StorageOptions, ) +from bigframes import constants + class PickleIOMixin: def read_pickle( @@ -52,4 +54,4 @@ def read_pickle( bigframes.dataframe.DataFrame or bigframes.series.Series: same type as object stored in file. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/base.py b/third_party/bigframes_vendored/sklearn/base.py index fc48cde85b..847ad06f75 100644 --- a/third_party/bigframes_vendored/sklearn/base.py +++ b/third_party/bigframes_vendored/sklearn/base.py @@ -9,6 +9,8 @@ import inspect from typing import Any, Dict, List +from bigframes import constants + class BaseEstimator: """Base class for all estimators. @@ -94,7 +96,7 @@ def score(self, X, y): Returns: bigframes.dataframe.DataFrame: A DataFrame of the evaluation result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RegressorMixin: @@ -120,7 +122,7 @@ def score(self, X, y): Returns: bigframes.dataframe.DataFrame: A DataFrame of the evaluation result. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class TransformerMixin: diff --git a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py index ea4df0dc02..ff1c04edbe 100644 --- a/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py +++ b/third_party/bigframes_vendored/sklearn/cluster/_kmeans.py @@ -14,6 +14,7 @@ from abc import ABC from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -32,7 +33,7 @@ def predict(self, X): bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the class labels for each sample. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class KMeans(_BaseKMeans): @@ -65,7 +66,7 @@ def fit( Returns: KMeans: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict( self, @@ -80,7 +81,7 @@ def predict( Returns: bigframes.dataframe.DataFrame: DataFrame of the cluster each sample belongs to. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score( self, @@ -98,7 +99,7 @@ def score( Returns: bigframes.dataframe.DataFrame: DataFrame of the metrics. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) @property def cluster_centers_(self): @@ -119,4 +120,4 @@ def cluster_centers_(self): The output contains one row per feature per centroid. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py index bc8bc3980a..dead173b2d 100644 --- a/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py +++ b/third_party/bigframes_vendored/sklearn/compose/_column_transformer.py @@ -6,6 +6,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -43,7 +44,7 @@ def fit( Returns: ColumnTransformer: Fitted estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform( self, @@ -58,4 +59,4 @@ def transform( Returns: bigframes.dataframe.DataFrame: Transformed result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py index 619c13f35d..85feab0024 100644 --- a/third_party/bigframes_vendored/sklearn/decomposition/_pca.py +++ b/third_party/bigframes_vendored/sklearn/decomposition/_pca.py @@ -12,6 +12,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -55,7 +56,7 @@ def fit(self, X, y=None): Returns: PCA: Fitted estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X=None, y=None): """Return the metrics of the model. @@ -69,7 +70,7 @@ def score(self, X=None, y=None): Returns: bigframes.dataframe.DataFrame: DataFrame that represents model metrics. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): """Predict the closest cluster for each sample in X. @@ -80,4 +81,51 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: predicted DataFrames.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def components_(self): + """Principal axes in feature space, representing the directions of maximum variance in the data. + + Returns: + bigframes.dataframe.DataFrame: DataFrame of principal components, containing following columns: + principal_component_id: An integer that identifies the principal component. + + feature: The column name that contains the feature. + + numerical_value: If feature is numeric, the value of feature for the principal component that principal_component_id identifies. If feature isn't numeric, the value is NULL. + + categorical_value: An list of mappings containing information about categorical features. Each mapping contains the following fields: + categorical_value.category: The name of each category. + + categorical_value.value: The value of categorical_value.category for the centroid that centroid_id identifies. + + The output contains one row per feature per component. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def explained_variance_(self): + """The amount of variance explained by each of the selected components. + + Returns: + bigframes.dataframe.DataFrame: DataFrame containing following columns: + principal_component_id: An integer that identifies the principal component. + + explained_variance: The factor by which the eigenvector is scaled. Eigenvalue and explained variance are the same concepts in PCA. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + @property + def explained_variance_ratio_(self): + """Percentage of variance explained by each of the selected components. + + Returns: + bigframes.dataframe.DataFrame: DataFrame containing following columns: + principal_component_id: An integer that identifies the principal component. + + explained_variance_ratio: the total variance is the sum of variances, also known as eigenvalues, of all + of the individual principal components. The explained variance ratio by a principal component is + the ratio between the variance, also known as eigenvalue, of that principal component and the total variance. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py index 73f4684dc3..79224a772d 100644 --- a/third_party/bigframes_vendored/sklearn/ensemble/_forest.py +++ b/third_party/bigframes_vendored/sklearn/ensemble/_forest.py @@ -33,6 +33,8 @@ class calls the ``fit`` method of each sub-estimator on random samples from abc import ABCMeta +from bigframes import constants + from ..base import BaseEstimator, ClassifierMixin, MetaEstimatorMixin, RegressorMixin @@ -60,7 +62,7 @@ def fit(self, X, y): Returns: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): @@ -82,7 +84,7 @@ def predict(self, X): Returns: The predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RandomForestRegressor(ForestRegressor): @@ -148,7 +150,7 @@ def predict(self, X): Returns: The predicted values. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class RandomForestClassifier(ForestClassifier): diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_base.py b/third_party/bigframes_vendored/sklearn/linear_model/_base.py index 65e895298d..8141da4e3b 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_base.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_base.py @@ -18,6 +18,7 @@ from abc import ABCMeta from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import ( BaseEstimator, ClassifierMixin, @@ -36,7 +37,7 @@ def predict(self, X): Returns: bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,). Returns predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class LinearClassifierMixin(ClassifierMixin): @@ -52,7 +53,7 @@ def predict(self, X): bigframes.dataframe.DataFrame: DataFrame of shape (n_samples,), containing the class labels for each sample. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class LinearRegression(RegressorMixin, LinearModel): @@ -92,4 +93,4 @@ def fit( Returns: LinearRegression: Fitted Estimator. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py index 8525e57068..a06035eef6 100644 --- a/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py +++ b/third_party/bigframes_vendored/sklearn/linear_model/_logistic.py @@ -14,6 +14,7 @@ from typing import List, Optional +from bigframes import constants from third_party.bigframes_vendored.sklearn.linear_model._base import ( BaseEstimator, LinearClassifierMixin, @@ -57,4 +58,4 @@ def fit( Returns: LogisticRegression: Fitted Estimator. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_classification.py b/third_party/bigframes_vendored/sklearn/metrics/_classification.py index 6d9692ac8d..a9d8038e59 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_classification.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_classification.py @@ -20,6 +20,8 @@ # Michal Karbownik # License: BSD 3 clause +from bigframes import constants + def accuracy_score(y_true, y_pred, normalize=True) -> float: """Accuracy classification score. @@ -39,7 +41,7 @@ def accuracy_score(y_true, y_pred, normalize=True) -> float: classified samples (float), else returns the number of correctly classified samples (int). """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def confusion_matrix( @@ -68,7 +70,7 @@ def confusion_matrix( samples with true label being i-th class and predicted label being j-th class. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def recall_score( @@ -99,7 +101,7 @@ def recall_score( of the positive class in binary classification or weighted average of the recall of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def precision_score( @@ -132,7 +134,7 @@ def precision_score( Precision of the positive class in binary classification or weighted average of the precision of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def f1_score( @@ -167,4 +169,4 @@ def f1_score( average of the F1 scores of each class for the multiclass task. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py index 693996070f..ac919edbe3 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_ranking.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_ranking.py @@ -16,6 +16,8 @@ # Michal Karbownik # License: BSD 3 clause +from bigframes import constants + def auc(x, y) -> float: """Compute Area Under the Curve (AUC) using the trapezoidal rule. @@ -35,7 +37,7 @@ def auc(x, y) -> float: Returns: float: Area Under the Curve. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def roc_auc_score(y_true, y_score) -> float: @@ -60,7 +62,7 @@ def roc_auc_score(y_true, y_score) -> float: Returns: float: Area Under the Curve score. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def roc_curve( @@ -95,4 +97,4 @@ def roc_curve( fpr and tpr. 
`thresholds[0]` represents no instances being predicted and is arbitrarily set to `max(y_score) + 1`. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/metrics/_regression.py b/third_party/bigframes_vendored/sklearn/metrics/_regression.py index b90c415887..9740c540e9 100644 --- a/third_party/bigframes_vendored/sklearn/metrics/_regression.py +++ b/third_party/bigframes_vendored/sklearn/metrics/_regression.py @@ -24,6 +24,8 @@ # Ohad Michel # License: BSD 3 clause +from bigframes import constants + def r2_score(y_true, y_pred, force_finite=True) -> float: """:math:`R^2` (coefficient of determination) regression score function. @@ -49,4 +51,4 @@ def r2_score(y_true, y_pred, force_finite=True) -> float: Returns: float: The :math:`R^2` score. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/pipeline.py b/third_party/bigframes_vendored/sklearn/pipeline.py index f8bbae86df..4b8eb25a97 100644 --- a/third_party/bigframes_vendored/sklearn/pipeline.py +++ b/third_party/bigframes_vendored/sklearn/pipeline.py @@ -11,6 +11,7 @@ from abc import ABCMeta +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -47,7 +48,7 @@ def fit( Returns: Pipeline: Pipeline with fitted steps. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def score(self, X, y): @@ -67,7 +68,7 @@ def score(self, X, y): DataFrame: A DataFrame representing the result of calling `score` on the final estimator. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def predict(self, X): @@ -81,4 +82,4 @@ def predict(self, X): bigframes.dataframe.DataFrame: A Dataframe representing predicted result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py index c57d1f2230..d013043467 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_data.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_data.py @@ -7,6 +7,7 @@ # Eric Chang # License: BSD 3 clause +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -59,7 +60,7 @@ def fit(self, X): Returns: StandardScaler: Fitted scaler. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform(self, X): """Perform standardization by centering and scaling. @@ -71,4 +72,4 @@ def transform(self, X): Returns: bigframes.dataframe.DataFrame: Transformed result. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py index a6c32d91c1..b1cf17e539 100644 --- a/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py +++ b/third_party/bigframes_vendored/sklearn/preprocessing/_encoder.py @@ -2,6 +2,7 @@ # Joris Van den Bossche # License: BSD 3 clause +from bigframes import constants from third_party.bigframes_vendored.sklearn.base import BaseEstimator @@ -61,7 +62,7 @@ def fit(self, X): Returns: OneHotEncoder: Fitted encoder. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def transform(self, X): """Transform X using one-hot encoding. @@ -73,4 +74,4 @@ def transform(self, X): Returns: bigframes.dataframe.DataFrame: The result is categorized as index: number, value: number. Where index is the position of the dict that seeing the category, and value is 0 or 1.""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/xgboost/sklearn.py b/third_party/bigframes_vendored/xgboost/sklearn.py index fcb5d2ec59..620c87fa3d 100644 --- a/third_party/bigframes_vendored/xgboost/sklearn.py +++ b/third_party/bigframes_vendored/xgboost/sklearn.py @@ -2,6 +2,8 @@ from typing import Any +from bigframes import constants + from ..sklearn.base import BaseEstimator as XGBModelBase from ..sklearn.base import ClassifierMixin as XGBClassifierBase from ..sklearn.base import RegressorMixin as XGBRegressorBase @@ -18,7 +20,7 @@ def predict(self, X): Returns: DataFrame of shape (n_samples,): Returns predicted values. """ - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def fit(self, X, y): """Fit gradient boosting model. @@ -42,7 +44,7 @@ def fit(self, X, y): Returns: XGBModel: Fitted Estimator. 
""" - raise NotImplementedError("abstract method") + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) class XGBClassifierMixIn: From 7011a127722514fc562ff5a7ee259a5904f8c9ab Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Tue, 29 Aug 2023 11:40:36 -0500 Subject: [PATCH 3/7] chore: add templated GitHub config files (#12) Pulled manually from https://github.com/googleapis/synthtool/tree/master/synthtool/gcp/templates/python_library/.github Change-Id: I7f36912dffb427af2cd388abb8109670ac162701 --- .github/CODEOWNERS | 11 ++++ .github/CONTRIBUTING.md | 28 ++++++++++ .github/ISSUE_TEMPLATE/bug_report.md | 43 ++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 18 ++++++ .github/ISSUE_TEMPLATE/support_request.md | 7 +++ .github/PULL_REQUEST_TEMPLATE.md | 7 +++ .github/auto-approve.yml | 3 + .github/auto-label.yaml | 15 +++++ .github/header-checker-lint.yml | 15 +++++ .github/release-please.yml | 2 + .github/release-trigger.yml | 1 + .github/snippet-bot.yml | 0 .github/sync-repo-settings.yaml | 31 +++++++++++ .github/workflows/docs.yml | 38 +++++++++++++ .github/workflows/lint.yml | 25 +++++++++ .github/workflows/unittest.yml | 57 +++++++++++++++++++ SECURITY.md | 7 +++ owlbot.py | 68 +++++++++++++++++++++++ 18 files changed, 376 insertions(+) create mode 100644 .github/CODEOWNERS create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/ISSUE_TEMPLATE/support_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/auto-approve.yml create mode 100644 .github/auto-label.yaml create mode 100644 .github/header-checker-lint.yml create mode 100644 .github/release-please.yml create mode 100644 .github/release-trigger.yml create mode 100644 .github/snippet-bot.yml create mode 100644 .github/sync-repo-settings.yaml create mode 100644 .github/workflows/docs.yml create mode 100644 
.github/workflows/lint.yml create mode 100644 .github/workflows/unittest.yml create mode 100644 SECURITY.md create mode 100644 owlbot.py diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000..cd904459a9 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,11 @@ +# Code owners file. +# This file controls who is tagged for review for any given pull request. +# +# For syntax help see: +# https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners#codeowners-syntax +# Note: This file is autogenerated. To make changes to the codeowner team, please update .repo-metadata.json. +# @googleapis/yoshi-python @googleapis/api-bigquery-dataframe are the default owners for changes in this repo +* @googleapis/yoshi-python @googleapis/api-bigquery-dataframe + +# @googleapis/python-samples-reviewers @googleapis/api-bigquery-dataframe are the default owners for samples changes +/samples/ @googleapis/python-samples-reviewers @googleapis/api-bigquery-dataframe diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000000..939e5341e7 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,28 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. 
We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +## Community Guidelines + +This project follows [Google's Open Source Community +Guidelines](https://opensource.google.com/conduct/). diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000000..7b0900728e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,43 @@ +--- +name: Bug report +about: Create a report to help us improve + +--- + +Thanks for stopping by to let us know something could be better! + +**PLEASE READ**: If you have a support contract with Google, please create an issue in the [support console](https://cloud.google.com/support/) instead of filing on GitHub. This will ensure a timely response. + +Please run down the following list and make sure you've tried the usual "quick fixes": + + - Search the issues already opened: https://github.com/googleapis/python-bigquery-dataframes/issues + - Search StackOverflow: https://stackoverflow.com/questions/tagged/google-cloud-platform+python + +If you are still having issues, please be sure to include as much information as possible: + +#### Environment details + + - OS type and version: + - Python version: `python --version` + - pip version: `pip --version` + - `bigframes` version: `pip show bigframes` + +#### Steps to reproduce + + 1. ? + 2. ? + +#### Code example + +```python +# example +``` + +#### Stack trace +``` +# example +``` + +Making sure to follow these steps will guarantee the quickest resolution possible. + +Thanks! 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000000..6365857f33 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,18 @@ +--- +name: Feature request +about: Suggest an idea for this library + +--- + +Thanks for stopping by to let us know something could be better! + +**PLEASE READ**: If you have a support contract with Google, please create an issue in the [support console](https://cloud.google.com/support/) instead of filing on GitHub. This will ensure a timely response. + + **Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + **Describe the solution you'd like** +A clear and concise description of what you want to happen. + **Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + **Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/support_request.md b/.github/ISSUE_TEMPLATE/support_request.md new file mode 100644 index 0000000000..9958690321 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/support_request.md @@ -0,0 +1,7 @@ +--- +name: Support request +about: If you have a support contract with Google, please create an issue in the Google Cloud Support console. + +--- + +**PLEASE READ**: If you have a support contract with Google, please create an issue in the [support console](https://cloud.google.com/support/) instead of filing on GitHub. This will ensure a timely response. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000..3e59d9a70d --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,7 @@ +Thank you for opening a Pull Request! 
Before submitting your PR, there are a few things you can do to make sure it goes smoothly: +- [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea +- [ ] Ensure the tests and linter pass +- [ ] Code coverage does not decrease (if any source code was changed) +- [ ] Appropriate docs were updated (if necessary) + +Fixes # 🦕 diff --git a/.github/auto-approve.yml b/.github/auto-approve.yml new file mode 100644 index 0000000000..311ebbb853 --- /dev/null +++ b/.github/auto-approve.yml @@ -0,0 +1,3 @@ +# https://github.com/googleapis/repo-automation-bots/tree/main/packages/auto-approve +processes: + - "OwlBotTemplateChanges" diff --git a/.github/auto-label.yaml b/.github/auto-label.yaml new file mode 100644 index 0000000000..b2016d119b --- /dev/null +++ b/.github/auto-label.yaml @@ -0,0 +1,15 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+requestsize: + enabled: true diff --git a/.github/header-checker-lint.yml b/.github/header-checker-lint.yml new file mode 100644 index 0000000000..3058bec338 --- /dev/null +++ b/.github/header-checker-lint.yml @@ -0,0 +1,15 @@ +{"allowedCopyrightHolders": ["Google LLC"], + "allowedLicenses": ["Apache-2.0", "MIT", "BSD-3"], + "ignoreFiles": ["**/requirements.txt", "**/requirements-test.txt", "**/__init__.py", "samples/**/constraints.txt", "samples/**/constraints-test.txt"], + "sourceFileExtensions": [ + "ts", + "js", + "java", + "sh", + "Dockerfile", + "yaml", + "py", + "html", + "txt" + ] +} diff --git a/.github/release-please.yml b/.github/release-please.yml new file mode 100644 index 0000000000..466597e5b1 --- /dev/null +++ b/.github/release-please.yml @@ -0,0 +1,2 @@ +releaseType: python +handleGHRelease: true diff --git a/.github/release-trigger.yml b/.github/release-trigger.yml new file mode 100644 index 0000000000..d4ca94189e --- /dev/null +++ b/.github/release-trigger.yml @@ -0,0 +1 @@ +enabled: true diff --git a/.github/snippet-bot.yml b/.github/snippet-bot.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.github/sync-repo-settings.yaml b/.github/sync-repo-settings.yaml new file mode 100644 index 0000000000..5004e872b4 --- /dev/null +++ b/.github/sync-repo-settings.yaml @@ -0,0 +1,31 @@ +# https://github.com/googleapis/repo-automation-bots/tree/main/packages/sync-repo-settings +# Rules for main branch protection +branchProtectionRules: +# Identifies the protection rule pattern. Name of the branch to be protected. 
+# Defaults to `main` +- pattern: main + requiresCodeOwnerReviews: true + requiresStrictStatusChecks: true + requiredStatusCheckContexts: + - 'cla/google' + - 'OwlBot Post Processor' + - 'docs' + - 'lint' + - 'unit (3.9)' + - 'unit (3.10)' + - 'unit (3.11)' + - 'cover' + # TODO(tswast): add Kokoro once we've enabled it +permissionRules: + - team: actools-python + permission: admin + - team: actools + permission: admin + - team: api-bigquery-dataframe + permission: push + - team: yoshi-python + permission: push + - team: python-samples-owners + permission: push + - team: python-samples-reviewers + permission: push diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000..e97d89e484 --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,38 @@ +on: + pull_request: + branches: + - main +name: docs +jobs: + docs: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run docs + run: | + nox -s docs + docfx: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.9" + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run docfx + run: | + nox -s docfx diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000..d2aee5b7d8 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,25 @@ +on: + pull_request: + branches: + - main +name: lint +jobs: + lint: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install nox + run: | + python -m pip 
install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run lint + run: | + nox -s lint + - name: Run lint_setup_py + run: | + nox -s lint_setup_py diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml new file mode 100644 index 0000000000..bb268fe6c6 --- /dev/null +++ b/.github/workflows/unittest.yml @@ -0,0 +1,57 @@ +on: + pull_request: + branches: + - main +name: unittest +jobs: + unit: + runs-on: ubuntu-latest + strategy: + matrix: + python: ['3.9', '3.10', '3.11'] + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + - name: Install nox + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install nox + - name: Run unit tests + env: + COVERAGE_FILE: .coverage-${{ matrix.python }} + run: | + nox -s unit-${{ matrix.python }} + - name: Upload coverage results + uses: actions/upload-artifact@v3 + with: + name: coverage-artifacts + path: .coverage-${{ matrix.python }} + + cover: + runs-on: ubuntu-latest + needs: + - unit + steps: + - name: Checkout + uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install coverage + run: | + python -m pip install --upgrade setuptools pip wheel + python -m pip install coverage + - name: Download coverage results + uses: actions/download-artifact@v3 + with: + name: coverage-artifacts + path: .coverage-results/ + - name: Report coverage results + run: | + coverage combine .coverage-results/.coverage* + coverage report --show-missing --fail-under=40 diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..8b58ae9c01 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,7 @@ +# Security Policy + +To report a security issue, please use [g.co/vulnz](https://g.co/vulnz). + +The Google Security Team will respond within 5 working days of your report on g.co/vulnz. 
+ +We use g.co/vulnz for our intake, and do coordination and disclosure here using GitHub Security Advisory to privately discuss and fix the issue. diff --git a/owlbot.py b/owlbot.py new file mode 100644 index 0000000000..4ba7d14eb5 --- /dev/null +++ b/owlbot.py @@ -0,0 +1,68 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""This script is used to synthesize generated parts of this library.""" + +import pathlib + +from synthtool import gcp +import synthtool as s +from synthtool.languages import python + +REPO_ROOT = pathlib.Path(__file__).parent.absolute() + +common = gcp.CommonTemplates() + +# ---------------------------------------------------------------------------- +# Add templated files +# ---------------------------------------------------------------------------- + +templated_files = common.py_library( + unit_test_python_versions=["3.9", "3.10", "3.11"], + system_test_python_versions=["3.9", "3.11"], + cov_level=40, + intersphinx_dependencies={ + "pandas": "https://pandas.pydata.org/pandas-docs/stable/", + "pydata-google-auth": "https://pydata-google-auth.readthedocs.io/en/latest/", + }, +) +s.move( + templated_files, + excludes=[ + # Multi-processing note isn't relevant, as pandas_gbq is responsible for + # creating clients, not the end user. 
+ "docs/multiprocessing.rst", + "noxfile.py", + "README.rst", + ], +) + +# ---------------------------------------------------------------------------- +# Fixup files +# ---------------------------------------------------------------------------- + + +# ---------------------------------------------------------------------------- +# Samples templates +# ---------------------------------------------------------------------------- + +python.py_samples(skip_readmes=True) + +# ---------------------------------------------------------------------------- +# Final cleanup +# ---------------------------------------------------------------------------- + +s.shell.run(["nox", "-s", "blacken"], hide_output=False) +for noxfile in REPO_ROOT.glob("samples/**/noxfile.py"): + s.shell.run(["nox", "-s", "blacken"], cwd=noxfile.parent, hide_output=False) From bf6ecb81afeb199b3dad07d1fd2057668352f939 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Aug 2023 10:16:44 -0700 Subject: [PATCH 4/7] chore(deps): bump cryptography from 41.0.1 to 41.0.3 in /.kokoro (#3) Bumps [cryptography](https://github.com/pyca/cryptography) from 41.0.1 to 41.0.3. - [Changelog](https://github.com/pyca/cryptography/blob/main/CHANGELOG.rst) - [Commits](https://github.com/pyca/cryptography/compare/41.0.1...41.0.3) --- updated-dependencies: - dependency-name: cryptography dependency-type: indirect ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .kokoro/requirements.txt | 45 ++++++++++++++++++++++------------------ 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/.kokoro/requirements.txt b/.kokoro/requirements.txt index 37a007667f..fd0e0b9d48 100644 --- a/.kokoro/requirements.txt +++ b/.kokoro/requirements.txt @@ -123,26 +123,30 @@ commonmark==0.9.1 \ --hash=sha256:452f9dc859be7f06631ddcb328b6919c67984aca654e5fefb3914d54691aed60 \ --hash=sha256:da2f38c92590f83de410ba1a3cbceafbc74fee9def35f9251ba9a971d6d66fd9 # via recommonmark -cryptography==41.0.1 \ - --hash=sha256:059e348f9a3c1950937e1b5d7ba1f8e968508ab181e75fc32b879452f08356db \ - --hash=sha256:1a5472d40c8f8e91ff7a3d8ac6dfa363d8e3138b961529c996f3e2df0c7a411a \ - --hash=sha256:1a8e6c2de6fbbcc5e14fd27fb24414507cb3333198ea9ab1258d916f00bc3039 \ - --hash=sha256:1fee5aacc7367487b4e22484d3c7e547992ed726d14864ee33c0176ae43b0d7c \ - --hash=sha256:5d092fdfedaec4cbbffbf98cddc915ba145313a6fdaab83c6e67f4e6c218e6f3 \ - --hash=sha256:5f0ff6e18d13a3de56f609dd1fd11470918f770c6bd5d00d632076c727d35485 \ - --hash=sha256:7bfc55a5eae8b86a287747053140ba221afc65eb06207bedf6e019b8934b477c \ - --hash=sha256:7fa01527046ca5facdf973eef2535a27fec4cb651e4daec4d043ef63f6ecd4ca \ - --hash=sha256:8dde71c4169ec5ccc1087bb7521d54251c016f126f922ab2dfe6649170a3b8c5 \ - --hash=sha256:8f4ab7021127a9b4323537300a2acfb450124b2def3756f64dc3a3d2160ee4b5 \ - --hash=sha256:948224d76c4b6457349d47c0c98657557f429b4e93057cf5a2f71d603e2fc3a3 \ - --hash=sha256:9a6c7a3c87d595608a39980ebaa04d5a37f94024c9f24eb7d10262b92f739ddb \ - --hash=sha256:b46e37db3cc267b4dea1f56da7346c9727e1209aa98487179ee8ebed09d21e43 \ - --hash=sha256:b4ceb5324b998ce2003bc17d519080b4ec8d5b7b70794cbd2836101406a9be31 \ - --hash=sha256:cb33ccf15e89f7ed89b235cff9d49e2e62c6c981a6061c9c8bb47ed7951190bc \ - --hash=sha256:d198820aba55660b4d74f7b5fd1f17db3aa5eb3e6893b0a41b75e84e4f9e0e4b \ - 
--hash=sha256:d34579085401d3f49762d2f7d6634d6b6c2ae1242202e860f4d26b046e3a1006 \ - --hash=sha256:eb8163f5e549a22888c18b0d53d6bb62a20510060a22fd5a995ec8a05268df8a \ - --hash=sha256:f73bff05db2a3e5974a6fd248af2566134d8981fd7ab012e5dd4ddb1d9a70699 +cryptography==41.0.3 \ + --hash=sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306 \ + --hash=sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84 \ + --hash=sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47 \ + --hash=sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d \ + --hash=sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116 \ + --hash=sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207 \ + --hash=sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81 \ + --hash=sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087 \ + --hash=sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd \ + --hash=sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507 \ + --hash=sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858 \ + --hash=sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae \ + --hash=sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34 \ + --hash=sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906 \ + --hash=sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd \ + --hash=sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922 \ + --hash=sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7 \ + --hash=sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4 \ + --hash=sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574 \ + --hash=sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1 \ + 
--hash=sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c \ + --hash=sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e \ + --hash=sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de # via # gcp-releasetool # secretstorage @@ -416,6 +420,7 @@ protobuf==3.20.3 \ # gcp-docuploader # gcp-releasetool # google-api-core + # googleapis-common-protos pyasn1==0.5.0 \ --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ --hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde From cccac8c16c2daeb4f2defe4562cf099b1c738e07 Mon Sep 17 00:00:00 2001 From: Ashley Xu <139821907+ashleyxuu@users.noreply.github.com> Date: Wed, 30 Aug 2023 11:42:04 -0700 Subject: [PATCH 5/7] chore: set up the kokoro release pipeline (#14) --- .kokoro/release.sh | 29 ++++++++++++++++++++++ .kokoro/release/common.cfg | 49 +++++++++++++++++++++++++++++++++++++ .kokoro/release/release.cfg | 1 + 3 files changed, 79 insertions(+) create mode 100644 .kokoro/release.sh create mode 100644 .kokoro/release/common.cfg create mode 100644 .kokoro/release/release.cfg diff --git a/.kokoro/release.sh b/.kokoro/release.sh new file mode 100644 index 0000000000..697b04009d --- /dev/null +++ b/.kokoro/release.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -eo pipefail + +# Start the releasetool reporter +python3 -m pip install --require-hashes -r github/python-bigquery-dataframes/.kokoro/requirements.txt +python3 -m releasetool publish-reporter-script > /tmp/publisher-script; source /tmp/publisher-script + +# Disable buffering, so that the logs stream through. +export PYTHONUNBUFFERED=1 + +# Move into the package, build the distribution and upload. +TWINE_PASSWORD=$(cat "${KOKORO_KEYSTORE_DIR}/73713_google-cloud-pypi-token-keystore-1") +cd github/python-bigquery-dataframes +python3 setup.py sdist bdist_wheel +twine upload --username __token__ --password "${TWINE_PASSWORD}" dist/* \ No newline at end of file diff --git a/.kokoro/release/common.cfg b/.kokoro/release/common.cfg new file mode 100644 index 0000000000..7ffa79c7a1 --- /dev/null +++ b/.kokoro/release/common.cfg @@ -0,0 +1,49 @@ +# Format: //devtools/kokoro/config/proto/build.proto + +# Build logs will be here +action { + define_artifacts { + regex: "**/*sponge_log.xml" + } +} + +# Download trampoline resources. +gfile_resources: "/bigstore/cloud-devrel-kokoro-resources/trampoline" + +# Use the trampoline script to run in docker. +build_file: "python-bigquery-dataframes/.kokoro/trampoline_v2.sh" + +# Configure the docker image for kokoro-trampoline. +env_vars: { + key: "TRAMPOLINE_IMAGE" + value: "gcr.io/cloud-devrel-kokoro-resources/python-multi" +} +env_vars: { + key: "TRAMPOLINE_BUILD_FILE" + value: "github/python-bigquery-dataframes/.kokoro/release.sh" +} + +# Fetch PyPI password +before_action { + fetch_keystore { + keystore_resource { + keystore_config_id: 73713 + keyname: "google-cloud-pypi-token-keystore-1" + } + } +} + +# Tokens needed to report release status back to GitHub +env_vars: { + key: "SECRET_MANAGER_KEYS" + value: "releasetool-publish-reporter-app,releasetool-publish-reporter-googleapis-installation,releasetool-publish-reporter-pem" +} + +# Store the packages we uploaded to PyPI. 
That way, we have a record of exactly +# what we published, which we can use to generate SBOMs and attestations. +action { + define_artifacts { + regex: "github/python-bigquery-dataframes/**/*.tar.gz" + strip_prefix: "github/python-bigquery-dataframes" + } +} \ No newline at end of file diff --git a/.kokoro/release/release.cfg b/.kokoro/release/release.cfg new file mode 100644 index 0000000000..8f43917d92 --- /dev/null +++ b/.kokoro/release/release.cfg @@ -0,0 +1 @@ +# Format: //devtools/kokoro/config/proto/build.proto \ No newline at end of file From 8fab75576757230bca5c7df10994837ac406300f Mon Sep 17 00:00:00 2001 From: Tim Swast Date: Sat, 2 Sep 2023 09:14:42 -0500 Subject: [PATCH 6/7] chore: sync changes from internal repo (#15) feat: support `DataFrame.isin` with list and dict inputs test: move flaky `reset_session` test to unit tests chore: don't run redundant tests in nightly build feat: support `DataFrame`-`DataFrame` binary operations feat: support `Series.map` feat: support `Index.is_monotonic` docs: update remote function notebook with read_gbq_function usage feat: use default session and connection in `ml.llm` and `ml.imported` chore: disable broken stack tests feat: support `pow()` and power operator in `DataFrame` and `Series` feat: support for `np.add`, `np.subtract`, `np.multiply`, `np.divide`, `np.power` perf: use `row_number()` filter for `head` / `tail` feat: support `bigframes.pandas.merge()` fix: make `X_train` argument names consistent across methods chore: refactor ml core feat: add `Series.dropna` and `DataFrame.fillna` chore: fix gcs notebooks upload in 'nightly' build chore: fix Kokoro build files to support GitHub directories chore: fix unit test to not require authentication --- .kokoro/build.sh | 8 +- .kokoro/continuous/nightly.cfg | 5 - .kokoro/release-nightly.sh | 29 +- OWNERS | 1 + bigframes/_config/bigquery_options.py | 22 +- bigframes/clients.py | 163 ++++ bigframes/core/__init__.py | 14 - bigframes/core/blocks.py | 71 ++ 
bigframes/core/indexes/index.py | 30 + bigframes/core/joins/__init__.py | 2 + bigframes/core/joins/merge.py | 67 ++ bigframes/dataframe.py | 180 ++++- bigframes/ml/cluster.py | 2 +- bigframes/ml/core.py | 58 +- bigframes/ml/decomposition.py | 2 +- bigframes/ml/imported.py | 16 +- bigframes/ml/llm.py | 88 +- bigframes/ml/sql.py | 16 +- bigframes/operations/__init__.py | 123 ++- bigframes/pandas/__init__.py | 34 +- bigframes/remote_function.py | 115 +-- bigframes/series.py | 118 ++- bigframes/session.py | 38 +- notebooks/experimental/longer_ml_demo.ipynb | 34 +- .../getting_started/ml_fundamentals.ipynb | 58 +- .../sklearn_linear_regression.ipynb | 8 +- .../remote_functions/remote_function.ipynb | 749 +++++++++++++----- tests/system/conftest.py | 18 +- tests/system/large/ml/test_core.py | 16 +- tests/system/large/ml/test_ensemble.py | 64 +- tests/system/large/ml/test_forecasting.py | 6 +- tests/system/large/ml/test_linear_model.py | 32 +- tests/system/small/ml/conftest.py | 40 +- tests/system/small/ml/test_ensemble.py | 72 +- tests/system/small/ml/test_imported.py | 10 + tests/system/small/ml/test_linear_model.py | 36 +- tests/system/small/ml/test_llm.py | 28 + tests/system/small/test_dataframe.py | 133 +++- tests/system/small/test_index.py | 14 + tests/system/small/test_multiindex.py | 68 +- tests/system/small/test_numpy.py | 66 ++ tests/system/small/test_pandas.py | 104 +++ tests/system/small/test_pandas_options.py | 43 - tests/system/small/test_remote_function.py | 35 +- tests/system/small/test_series.py | 89 +++ tests/unit/_config/test_bigquery_options.py | 4 +- tests/unit/conftest.py | 223 ------ .../core/{test_utils.py => test_bf_utils.py} | 0 tests/unit/resources.py | 73 ++ tests/unit/test_core.py | 62 +- tests/unit/test_pandas.py | 39 + tests/unit/test_session.py | 15 +- .../bigframes_vendored/pandas/core/frame.py | 98 +++ .../pandas/core/reshape/merge.py | 78 ++ .../bigframes_vendored/pandas/core/series.py | 102 ++- 55 files changed, 2617 insertions(+), 1002 
deletions(-) create mode 100644 bigframes/clients.py create mode 100644 bigframes/core/joins/merge.py delete mode 100644 tests/unit/conftest.py rename tests/unit/core/{test_utils.py => test_bf_utils.py} (100%) create mode 100644 tests/unit/resources.py create mode 100644 third_party/bigframes_vendored/pandas/core/reshape/merge.py diff --git a/.kokoro/build.sh b/.kokoro/build.sh index f80cf6eab9..402ac0eb8c 100755 --- a/.kokoro/build.sh +++ b/.kokoro/build.sh @@ -15,8 +15,14 @@ set -eo pipefail +if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then + PROJECT_SCM="github" +else + PROJECT_SCM="git" +fi + if [[ -z "${PROJECT_ROOT:-}" ]]; then - PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/git/bigframes" + PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}/bigframes" fi cd "${PROJECT_ROOT}" diff --git a/.kokoro/continuous/nightly.cfg b/.kokoro/continuous/nightly.cfg index ac34c4b0c6..63c3f51d05 100644 --- a/.kokoro/continuous/nightly.cfg +++ b/.kokoro/continuous/nightly.cfg @@ -1,8 +1,3 @@ # Format: //devtools/kokoro/config/proto/build.proto -env_vars: { - key: "NOX_SESSION" - value: "unit system cover lint lint_setup_py mypy format docs e2e notebook" -} - build_file: "bigframes/.kokoro/release-nightly.sh" diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh index e3b6b4d449..582808a15c 100755 --- a/.kokoro/release-nightly.sh +++ b/.kokoro/release-nightly.sh @@ -34,8 +34,14 @@ while [ $# -gt 0 ] ; do shift 1; done +if [[ -z "${KOKORO_GOB_COMMIT}" ]]; then + PROJECT_SCM="github" +else + PROJECT_SCM="git" +fi + if [ -z "${PROJECT_ROOT:-}" ]; then - PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/git/bigframes" + PROJECT_ROOT="${KOKORO_ARTIFACTS_DIR}/${PROJECT_SCM}/bigframes" fi # Move into the package, build the distribution and upload to shared bucket. 
@@ -57,16 +63,6 @@ export PYTHONUNBUFFERED=1 # Install dependencies, as the following steps depend on it python3.10 -m pip install -e .[all] -# If NOX_SESSION is set, it only runs the specified session, -# otherwise run all the sessions. -if ! [ ${DRY_RUN} ]; then - if [ -n "${NOX_SESSION:-}" ]; then - python3.10 -m nox -s ${NOX_SESSION:-} - else - python3.10 -m nox - fi -fi - # Generate third party notices and include it in the licenses in setup.cfg # TODO(shobs): Don't include it in the package once vertex colab can pick it # from elsewhere @@ -138,15 +134,8 @@ if ! [ ${DRY_RUN} ]; then gsutil cp -v dist/* ${gcs_path} gsutil cp -v LICENSE ${gcs_path} gsutil cp -v ${THIRD_PARTY_NOTICES_FILE} ${gcs_path} - gsutil -m cp -v "notebooks/00 - Summary.ipynb" \ - "notebooks/01 - Getting Started.ipynb" \ - "notebooks/02 - DataFrame.ipynb" \ - "notebooks/03 - Using ML - ML fundamentals.ipynb" \ - "notebooks/04 - Using ML - SKLearn linear regression.ipynb" \ - "notebooks/05 - Using ML - Easy linear regression.ipynb" \ - "notebooks/06 - Using ML - Large Language Models.ipynb" \ - "notebooks/50 - Remote Function.ipynb" \ - ${gcs_path}notebooks/ + gsutil -m cp -r -v "notebooks/" ${gcs_path}notebooks/ + done # publish API coverage information to BigQuery diff --git a/OWNERS b/OWNERS index f2a0b4383d..672da38afa 100644 --- a/OWNERS +++ b/OWNERS @@ -5,6 +5,7 @@ henryjsolberg@google.com hormati@google.com huanc@google.com jiaxun@google.com +kemppeterson@google.com shobs@google.com swast@google.com tbergeron@google.com diff --git a/bigframes/_config/bigquery_options.py b/bigframes/_config/bigquery_options.py index a103abe190..ea1864ed5f 100644 --- a/bigframes/_config/bigquery_options.py +++ b/bigframes/_config/bigquery_options.py @@ -35,13 +35,13 @@ def __init__( credentials: Optional[google.auth.credentials.Credentials] = None, project: Optional[str] = None, location: Optional[str] = None, - remote_udf_connection: Optional[str] = None, + bq_connection: Optional[str] = None, 
use_regional_endpoints: bool = False, ): self._credentials = credentials self._project = project self._location = location - self._remote_udf_connection = remote_udf_connection + self._bq_connection = bq_connection self._use_regional_endpoints = use_regional_endpoints self._session_started = False @@ -82,23 +82,21 @@ def project(self, value: Optional[str]): self._project = value @property - def remote_udf_connection(self) -> Optional[str]: - """Name of the BigQuery connection to use for remote functions. + def bq_connection(self) -> Optional[str]: + """Name of the BigQuery connection to use. You should either have the connection already created in the location you have chosen, or you should have the Project IAM Admin role to enable the service to create the connection for you if you need it. """ - return self._remote_udf_connection + return self._bq_connection - @remote_udf_connection.setter - def remote_udf_connection(self, value: Optional[str]): - if self._session_started and self._remote_udf_connection != value: - raise ValueError( - SESSION_STARTED_MESSAGE.format(attribute="remote_udf_connection") - ) - self._remote_udf_connection = value + @bq_connection.setter + def bq_connection(self, value: Optional[str]): + if self._session_started and self._bq_connection != value: + raise ValueError(SESSION_STARTED_MESSAGE.format(attribute="bq_connection")) + self._bq_connection = value @property def use_regional_endpoints(self) -> bool: diff --git a/bigframes/clients.py b/bigframes/clients.py new file mode 100644 index 0000000000..5c019e0fc8 --- /dev/null +++ b/bigframes/clients.py @@ -0,0 +1,163 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""BigQuery DataFrame clients to interact with other cloud resources""" + +from __future__ import annotations + +import logging +import time +from typing import Optional + +import google.api_core.exceptions +from google.cloud import bigquery_connection_v1, resourcemanager_v3 +from google.iam.v1 import iam_policy_pb2, policy_pb2 + +logging.basicConfig( + level=logging.INFO, format="[%(levelname)s][%(asctime)s][%(name)s] %(message)s" +) +logger = logging.getLogger(__name__) + + +class BqConnectionManager: + """Manager to handle operations with BQ connections.""" + + # Wait time (in seconds) for an IAM binding to take effect after creation + _IAM_WAIT_SECONDS = 120 + + def __init__( + self, + bq_connection_client: bigquery_connection_v1.ConnectionServiceClient, + cloud_resource_manager_client: resourcemanager_v3.ProjectsClient, + ): + self._bq_connection_client = bq_connection_client + self._cloud_resource_manager_client = cloud_resource_manager_client + + def create_bq_connection( + self, project_id: str, location: str, connection_id: str, iam_role: str + ): + """Create the BQ connection if not exist. In addition, try to add the IAM role to the connection to ensure required permissions. + + Args: + project_id: + ID of the project. + location: + Location of the connection. + connection_id: + ID of the connection. + iam_role: + str of the IAM role that the service account of the created connection needs to aquire. E.g. 'run.invoker', 'aiplatform.user' + """ + # TODO(shobs): The below command to enable BigQuery Connection API needs + # to be automated. 
Disabling for now since most target users would not + # have the privilege to enable API in a project. + # log("Making sure BigQuery Connection API is enabled") + # if os.system("gcloud services enable bigqueryconnection.googleapis.com"): + # raise ValueError("Failed to enable BigQuery Connection API") + # If the intended connection does not exist then create it + service_account_id = self._get_service_account_if_connection_exists( + project_id, location, connection_id + ) + if service_account_id: + logger.info( + f"Connector {project_id}.{location}.{connection_id} already exists" + ) + else: + connection_name, service_account_id = self._create_bq_connection( + project_id, location, connection_id + ) + logger.info( + f"Created BQ connection {connection_name} with service account id: {service_account_id}" + ) + # Ensure IAM role on the BQ connection + # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function + self._ensure_iam_binding(project_id, service_account_id, iam_role) + + # Introduce retries to accommodate transient errors like etag mismatch, + # which can be caused by concurrent operation on the same resource, and + # manifests with message like: + # google.api_core.exceptions.Aborted: 409 There were concurrent policy + # changes. Please retry the whole read-modify-write with exponential + # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match + # the current policy's ETag '\007\006\003,\3750&\363'. 
+ @google.api_core.retry.Retry( + predicate=google.api_core.retry.if_exception_type( + google.api_core.exceptions.Aborted + ), + initial=10, + maximum=20, + multiplier=2, + timeout=60, + ) + def _ensure_iam_binding( + self, project_id: str, service_account_id: str, iam_role: str + ): + """Ensure necessary IAM role is configured on a service account.""" + project = f"projects/{project_id}" + service_account = f"serviceAccount:{service_account_id}" + role = f"roles/{iam_role}" + request = iam_policy_pb2.GetIamPolicyRequest(resource=project) + policy = self._cloud_resource_manager_client.get_iam_policy(request=request) + + # Check if the binding already exists, and if does, do nothing more + for binding in policy.bindings: + if binding.role == role: + if service_account in binding.members: + return + + # Create a new binding + new_binding = policy_pb2.Binding(role=role, members=[service_account]) + policy.bindings.append(new_binding) + request = iam_policy_pb2.SetIamPolicyRequest(resource=project, policy=policy) + self._cloud_resource_manager_client.set_iam_policy(request=request) + + # We would wait for the IAM policy change to take effect + # https://cloud.google.com/iam/docs/access-change-propagation + logger.info( + f"Waiting {self._IAM_WAIT_SECONDS} seconds for IAM to take effect.." 
+ ) + time.sleep(self._IAM_WAIT_SECONDS) + + def _create_bq_connection(self, project_id: str, location: str, connection_id: str): + """Create the BigQuery Connection and returns corresponding service account id.""" + client = self._bq_connection_client + connection = bigquery_connection_v1.Connection( + cloud_resource=bigquery_connection_v1.CloudResourceProperties() + ) + request = bigquery_connection_v1.CreateConnectionRequest( + parent=client.common_location_path(project_id, location), + connection_id=connection_id, + connection=connection, + ) + connection = client.create_connection(request) + return connection.name, connection.cloud_resource.service_account_id + + def _get_service_account_if_connection_exists( + self, project_id: str, location: str, connection_id: str + ) -> Optional[str]: + """Check if the BigQuery Connection exists.""" + client = self._bq_connection_client + request = bigquery_connection_v1.GetConnectionRequest( + name=client.connection_path(project_id, location, connection_id) + ) + + service_account = None + try: + service_account = client.get_connection( + request=request + ).cloud_resource.service_account_id + except google.api_core.exceptions.NotFound: + pass + + return service_account diff --git a/bigframes/core/__init__.py b/bigframes/core/__init__.py index d6509e4c0a..7086269af9 100644 --- a/bigframes/core/__init__.py +++ b/bigframes/core/__init__.py @@ -1021,20 +1021,6 @@ def slice( if not step: step = 1 - # Special cases for head() and tail(), where we don't need to project - # offsets. LIMIT clause is much more efficient in BigQuery than a - # filter on row_number(). 
- if ( - (start is None or start == 0) - and step == 1 - and stop is not None - and stop > 0 - ): - return self.apply_limit(stop) - - if start is not None and start < 0 and step == 1 and stop is None: - return self.reversed().apply_limit(abs(start)).reversed() - expr_with_offsets = self.project_offsets() # start with True and reduce with start, stop, and step conditions diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index f23a4d0b5c..482cfd0141 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -62,6 +62,10 @@ _UNIFORM = "uniform" _SAMPLING_METHODS = (_HEAD, _UNIFORM) +# Monotonic Cache Names +_MONOTONIC_INCREASING = "monotonic_increasing" +_MONOTONIC_DECREASING = "monotonic_decreasing" + class BlockHolder(typing.Protocol): """Interface for mutable objects with state represented by a block value object.""" @@ -119,6 +123,8 @@ def __init__( } # TODO(kemppeterson) Add a cache for corr to parallel the single-column stats. + self._stats_cache[" ".join(self.index_columns)] = {} + @property def index(self) -> indexes.IndexValue: """Row identities for values in the Block.""" @@ -1408,6 +1414,71 @@ def _force_reproject(self) -> Block: index_labels=self.index.names, ) + def is_monotonic_increasing( + self, column_id: typing.Union[str, Sequence[str]] + ) -> bool: + return self._is_monotonic(column_id, increasing=True) + + def is_monotonic_decreasing( + self, column_id: typing.Union[str, Sequence[str]] + ) -> bool: + return self._is_monotonic(column_id, increasing=False) + + def _is_monotonic( + self, column_ids: typing.Union[str, Sequence[str]], increasing: bool + ) -> bool: + if isinstance(column_ids, str): + column_ids = (column_ids,) + + op_name = _MONOTONIC_INCREASING if increasing else _MONOTONIC_DECREASING + + column_name = " ".join(column_ids) + if op_name in self._stats_cache[column_name]: + return self._stats_cache[column_name][op_name] + + period = 1 + window = bigframes.core.WindowSpec( + preceding=period, + following=None, + 
) + + # any NaN value means not monotonic + block, last_notna_id = self.apply_unary_op(column_ids[0], ops.notnull_op) + for column_id in column_ids[1:]: + block, notna_id = block.apply_unary_op(column_id, ops.notnull_op) + block, last_notna_id = block.apply_binary_op( + last_notna_id, notna_id, ops.and_op + ) + + # loop over all columns to check monotonicity + last_result_id = None + for column_id in column_ids[::-1]: + block, lag_result_id = block.apply_window_op( + column_id, agg_ops.ShiftOp(period), window + ) + block, strict_monotonic_id = block.apply_binary_op( + column_id, lag_result_id, ops.gt_op if increasing else ops.lt_op + ) + block, equal_id = block.apply_binary_op(column_id, lag_result_id, ops.eq_op) + if last_result_id is None: + block, last_result_id = block.apply_binary_op( + equal_id, strict_monotonic_id, ops.or_op + ) + continue + block, equal_monotonic_id = block.apply_binary_op( + equal_id, last_result_id, ops.and_op + ) + block, last_result_id = block.apply_binary_op( + equal_monotonic_id, strict_monotonic_id, ops.or_op + ) + + block, monotonic_result_id = block.apply_binary_op( + last_result_id, last_notna_id, ops.and_op # type: ignore + ) + result = block.get_stat(monotonic_result_id, agg_ops.all_op) + self._stats_cache[column_name].update({op_name: result}) + return result + def block_from_local(data, session=None, use_index=True) -> Block: # TODO(tbergeron): Handle duplicate column labels diff --git a/bigframes/core/indexes/index.py b/bigframes/core/indexes/index.py index 4eb37e6d92..04b9a36b64 100644 --- a/bigframes/core/indexes/index.py +++ b/bigframes/core/indexes/index.py @@ -67,6 +67,36 @@ def empty(self) -> bool: """Returns True if the Index is empty, otherwise returns False.""" return self.shape[0] == 0 + @property + def is_monotonic_increasing(self) -> bool: + """ + Return a boolean if the values are equal or increasing. 
+ + Returns: + bool + """ + return typing.cast( + bool, + self._data._get_block().is_monotonic_increasing( + self._data._get_block().index_columns + ), + ) + + @property + def is_monotonic_decreasing(self) -> bool: + """ + Return a boolean if the values are equal or decreasing. + + Returns: + bool + """ + return typing.cast( + bool, + self._data._get_block().is_monotonic_decreasing( + self._data._get_block().index_columns + ), + ) + def __getitem__(self, key: int) -> typing.Any: if isinstance(key, int): result_pd_df, _ = self._data._get_block().slice(key, key + 1, 1).to_pandas() diff --git a/bigframes/core/joins/__init__.py b/bigframes/core/joins/__init__.py index 58a1c2cfd7..3f9447aef0 100644 --- a/bigframes/core/joins/__init__.py +++ b/bigframes/core/joins/__init__.py @@ -14,10 +14,12 @@ """Helpers to join ArrayValue objects.""" +from bigframes.core.joins.merge import merge from bigframes.core.joins.row_identity import join_by_row_identity from bigframes.core.joins.single_column import join_by_column __all__ = ( "join_by_row_identity", "join_by_column", + "merge", ) diff --git a/bigframes/core/joins/merge.py b/bigframes/core/joins/merge.py new file mode 100644 index 0000000000..fac16b3607 --- /dev/null +++ b/bigframes/core/joins/merge.py @@ -0,0 +1,67 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Functions for Merging Data Structures in BigFrames. 
+""" + +from __future__ import annotations + +from typing import Literal, Optional + +from bigframes.dataframe import DataFrame +from bigframes.series import Series + + +def merge( + left: DataFrame, + right: DataFrame, + how: Literal[ + "inner", + "left", + "outer", + "right", + ] = "inner", + on: Optional[str] = None, + *, + left_on: Optional[str] = None, + right_on: Optional[str] = None, + sort: bool = False, + suffixes: tuple[str, str] = ("_x", "_y"), +) -> DataFrame: + left = _validate_operand(left) + right = _validate_operand(right) + + return left.merge( + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + sort=sort, + suffixes=suffixes, + ) + + +def _validate_operand(obj: DataFrame | Series) -> DataFrame: + if isinstance(obj, DataFrame): + return obj + elif isinstance(obj, Series): + if obj.name is None: + raise ValueError("Cannot merge a Series without a name") + return obj.to_frame() + else: + raise TypeError( + f"Can only merge Series or DataFrame objects, a {type(obj)} was passed" + ) diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index ef443db079..31777f3fac 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -529,7 +529,7 @@ def __setitem__(self, key: str, value: SingleItemValue): def _apply_binop( self, - other: float | int | bigframes.series.Series, + other: float | int | bigframes.series.Series | DataFrame, op, axis: str | int = "columns", ): @@ -537,6 +537,8 @@ def _apply_binop( return self._apply_scalar_binop(other, op) elif isinstance(other, bigframes.series.Series): return self._apply_series_binop(other, op, axis=axis) + elif isinstance(other, DataFrame): + return self._apply_dataframe_binop(other, op) raise NotImplementedError( f"binary operation is not implemented on the second operand of type {type(other).__name__}." 
f"{constants.FEEDBACK_LINK}" @@ -588,6 +590,47 @@ def _apply_series_binop( block = block.with_index_labels(self.index.names) return DataFrame(block) + def _apply_dataframe_binop( + self, + other: DataFrame, + op: ops.BinaryOp, + ) -> DataFrame: + # Join rows + joined_index, (get_column_left, get_column_right) = self._block.index.join( + other._block.index, how="outer" + ) + # join columns schema + columns, lcol_indexer, rcol_indexer = self.columns.join( + other.columns, how="outer", return_indexers=True + ) + + binop_result_ids = [] + block = joined_index._block + for left_index, right_index in zip(lcol_indexer, rcol_indexer): + if left_index >= 0 and right_index >= 0: # -1 indices indicate missing + left_col_id = self._block.value_columns[left_index] + right_col_id = other._block.value_columns[right_index] + block, result_col_id = block.apply_binary_op( + get_column_left(left_col_id), + get_column_right(right_col_id), + op, + ) + binop_result_ids.append(result_col_id) + elif left_index >= 0: + dtype = self.dtypes[left_index] + block, null_col_id = block.create_constant(None, dtype=dtype) + binop_result_ids.append(null_col_id) + elif right_index >= 0: + dtype = other.dtypes[right_index] + block, null_col_id = block.create_constant(None, dtype=dtype) + binop_result_ids.append(null_col_id) + else: + # Should not be possible + raise ValueError("No right or left index.") + + block = block.select_columns(binop_result_ids).with_column_labels(columns) + return DataFrame(block) + def eq(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: return self._apply_binop(other, ops.eq_op, axis=axis) @@ -619,7 +662,9 @@ def gt(self, other: typing.Any, axis: str | int = "columns") -> DataFrame: __ge__ = ge def add( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: # TODO(swast): Support fill_value parameter. 
# TODO(swast): Support level parameter with MultiIndex. @@ -628,64 +673,92 @@ def add( __radd__ = __add__ = radd = add def sub( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.sub_op, axis=axis) __sub__ = subtract = sub def rsub( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.reverse(ops.sub_op), axis=axis) __rsub__ = rsub def mul( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.mul_op, axis=axis) __rmul__ = __mul__ = rmul = multiply = mul def truediv( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.div_op, axis=axis) div = divide = __truediv__ = truediv def rtruediv( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.reverse(ops.div_op), axis=axis) __rtruediv__ = rdiv = rtruediv def floordiv( - self, other: float | int | bigframes.series.Series, axis: str | int = "columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.floordiv_op, axis=axis) __floordiv__ = floordiv def rfloordiv( - self, other: float | int | bigframes.series.Series, axis: str | int = 
"columns" + self, + other: float | int | bigframes.series.Series | DataFrame, + axis: str | int = "columns", ) -> DataFrame: return self._apply_binop(other, ops.reverse(ops.floordiv_op), axis=axis) __rfloordiv__ = rfloordiv - def mod(self, other: int | bigframes.series.Series, axis: str | int = "columns") -> DataFrame: # type: ignore + def mod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.mod_op, axis=axis) - def rmod(self, other: int | bigframes.series.Series, axis: str | int = "columns") -> DataFrame: # type: ignore + def rmod(self, other: int | bigframes.series.Series | DataFrame, axis: str | int = "columns") -> DataFrame: # type: ignore return self._apply_binop(other, ops.reverse(ops.mod_op), axis=axis) __mod__ = mod __rmod__ = rmod + def pow( + self, other: int | bigframes.series.Series, axis: str | int = "columns" + ) -> DataFrame: + return self._apply_binop(other, ops.pow_op, axis=axis) + + def rpow( + self, other: int | bigframes.series.Series, axis: str | int = "columns" + ) -> DataFrame: + return self._apply_binop(other, ops.reverse(ops.pow_op), axis=axis) + + __pow__ = pow + + __rpow__ = rpow + def to_pandas( self, max_download_size: Optional[int] = None, @@ -1023,8 +1096,72 @@ def add_suffix(self, suffix: str, axis: int | str | None = None) -> DataFrame: axis = 1 if axis is None else axis return DataFrame(self._get_block().add_suffix(suffix, axis)) - def dropna(self) -> DataFrame: - return DataFrame(block_ops.dropna(self._block, how="any")) + def fillna(self, value=None) -> DataFrame: + return self._apply_binop(value, ops.fillna_op) + + def isin(self, values) -> DataFrame: + if utils.is_dict_like(values): + block = self._block + result_ids = [] + for col, label in zip(self._block.value_columns, self._block.column_labels): + if label in values.keys(): + value_for_key = values[label] + block, result_id = block.apply_unary_op( + col, 
ops.IsInOp(value_for_key, match_nulls=True), label + ) + result_ids.append(result_id) + else: + block, result_id = block.create_constant( + False, label=label, dtype=pandas.BooleanDtype() + ) + result_ids.append(result_id) + return DataFrame(block.select_columns(result_ids)).fillna(value=False) + elif utils.is_list_like(values): + return self._apply_unary_op(ops.IsInOp(values, match_nulls=True)).fillna( + value=False + ) + else: + raise TypeError( + "only list-like objects are allowed to be passed to " + f"isin(), you passed a [{type(values).__name__}]" + ) + + def dropna( + self, + *, + axis: int | str = 0, + inplace: bool = False, + how: str = "any", + ignore_index=False, + ) -> DataFrame: + if inplace: + raise NotImplementedError( + "'inplace'=True not supported. {constants.FEEDBACK_LINK}" + ) + if how not in ("any", "all"): + raise ValueError("'how' must be one of 'any', 'all'") + + axis_n = utils.get_axis_number(axis) + + if axis_n == 0: + result = block_ops.dropna(self._block, how=how) # type: ignore + if ignore_index: + result = result.reset_index() + return DataFrame(result) + else: + isnull_block = self._block.multi_apply_unary_op( + self._block.value_columns, ops.isnull_op + ) + if how == "any": + null_locations = DataFrame(isnull_block).any().to_pandas() + else: # 'all' + null_locations = DataFrame(isnull_block).all().to_pandas() + keep_columns = [ + col + for col, to_drop in zip(self._block.value_columns, null_locations) + if not to_drop + ] + return DataFrame(self._block.select_columns(keep_columns)) def any( self, @@ -1205,7 +1342,7 @@ def pivot( def stack(self): # TODO: support 'level' param by simply reordering levels such that selected level is last before passing to Block.stack. 
- # TODO: support 'dropna' param by executing dropna only conditionally + # TODO: match impl to pandas future_stack as described in pandas 2.1 release notes result_block = block_ops.dropna(self._block.stack(), how="all") if not isinstance(self.columns, pandas.MultiIndex): return bigframes.series.Series(result_block) @@ -1879,16 +2016,17 @@ def __array_ufunc__( """Used to support numpy ufuncs. See: https://numpy.org/doc/stable/reference/ufuncs.html """ - if ( - inputs[0] is not self - or method != "__call__" - or len(inputs) > 1 - or len(kwargs) > 0 - ): + if method != "__call__" or len(inputs) > 2 or len(kwargs) > 0: return NotImplemented - if ufunc in ops.NUMPY_TO_OP: + if len(inputs) == 1 and ufunc in ops.NUMPY_TO_OP: return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + if len(inputs) == 2 and ufunc in ops.NUMPY_TO_BINOP: + binop = ops.NUMPY_TO_BINOP[ufunc] + if inputs[0] is self: + return self._apply_binop(inputs[1], binop) + else: + return self._apply_binop(inputs[0], ops.reverse(binop)) return NotImplemented diff --git a/bigframes/ml/cluster.py b/bigframes/ml/cluster.py index 9a20fd9ad7..2501d2b21f 100644 --- a/bigframes/ml/cluster.py +++ b/bigframes/ml/cluster.py @@ -67,7 +67,7 @@ def fit( (X,) = utils.convert_to_dataframe(X) self._bqml_model = core.create_bqml_model( - train_X=X, + X_train=X, transforms=transforms, options=self._bqml_options, ) diff --git a/bigframes/ml/core.py b/bigframes/ml/core.py index 27727c9f81..57f610c4c4 100644 --- a/bigframes/ml/core.py +++ b/bigframes/ml/core.py @@ -22,7 +22,6 @@ from google.cloud import bigquery import bigframes -import bigframes.constants as constants from bigframes.ml import sql as ml_sql import bigframes.pandas as bpd @@ -53,10 +52,8 @@ def model(self) -> bigquery.Model: """Get the BQML model associated with this wrapper""" return self._model - @classmethod def _apply_sql( - cls, - session: bigframes.Session, + self, input_data: bpd.DataFrame, func: Callable[[str], str], ) -> bpd.DataFrame: @@ -78,13 +75,8 @@ 
def _apply_sql( include_index=True ) - if len(index_col_ids) != 1: - raise NotImplementedError( - f"Only exactly one index column is supported. {constants.FEEDBACK_LINK}" - ) - sql = func(source_sql) - df = session.read_gbq(sql, index_col=index_col_ids) + df = self._session.read_gbq(sql, index_col=index_col_ids) df.index.names = index_labels return df @@ -92,7 +84,6 @@ def _apply_sql( def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( - self._session, input_data, lambda source_sql: ml_sql.ml_predict( model_name=self.model_name, source_sql=source_sql @@ -102,7 +93,6 @@ def predict(self, input_data: bpd.DataFrame) -> bpd.DataFrame: def transform(self, input_data: bpd.DataFrame) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( - self._session, input_data, lambda source_sql: ml_sql.ml_transform( model_name=self.model_name, source_sql=source_sql @@ -116,7 +106,6 @@ def generate_text( ) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( - self._session, input_data, lambda source_sql: ml_sql.ml_generate_text( model_name=self.model_name, @@ -132,7 +121,6 @@ def generate_text_embedding( ) -> bpd.DataFrame: # TODO: validate input data schema return self._apply_sql( - self._session, input_data, lambda source_sql: ml_sql.ml_generate_text_embedding( model_name=self.model_name, @@ -209,16 +197,16 @@ def register(self, vertex_ai_model_id: Optional[str] = None) -> BqmlModel: def create_bqml_model( - train_X: bpd.DataFrame, - train_y: Optional[bpd.DataFrame] = None, + X_train: bpd.DataFrame, + y_train: Optional[bpd.DataFrame] = None, transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: """Create a session-temporary BQML model with the CREATE MODEL statement Args: - train_X: features columns for training - train_y: labels columns for training, if applicable + X_train: features columns 
for training + y_train: labels columns for training, if applicable transforms: an optional list of SQL expressions that implement preprocessing on top of the input data. Generates a BQML TRANSFORM clause options: a dict of options to configure the model. Generates a BQML OPTIONS @@ -227,19 +215,13 @@ def create_bqml_model( Returns: a BqmlModel, wrapping a trained model in BigQuery """ options = dict(options) - if train_y is None: - input_data = train_X + if y_train is None: + input_data = X_train else: - # TODO: handle case where train_y columns are renamed in the join - input_data = train_X.join(train_y, how="outer") - options.update({"INPUT_LABEL_COLS": train_y.columns.tolist()}) + input_data = X_train.join(y_train, how="outer") + options.update({"INPUT_LABEL_COLS": y_train.columns.tolist()}) - # pickpocket session object from the dataframe - session = train_X._get_block().expr._session - - # TODO(garrettwu): add wrapper to select the feature columns - # for now, drop index to avoid including the index in feature columns - input_data = input_data.reset_index(drop=True) + session = X_train._get_block().expr._session source_sql = input_data.sql options_sql = ml_sql.options(**options) @@ -255,25 +237,25 @@ def create_bqml_model( def create_bqml_time_series_model( - train_X: bpd.DataFrame, - train_y: bpd.DataFrame, + X_train: bpd.DataFrame, + y_train: bpd.DataFrame, transforms: Optional[Iterable[str]] = None, options: Mapping[str, Union[str, int, float, Iterable[str]]] = {}, ) -> BqmlModel: assert ( - train_X.columns.size == 1 + X_train.columns.size == 1 ), "Time series timestamp input must only contain 1 column." assert ( - train_y.columns.size == 1 + y_train.columns.size == 1 ), "Time stamp data input must only contain 1 column." 
options = dict(options) - input_data = train_X.join(train_y, how="outer") - options.update({"TIME_SERIES_TIMESTAMP_COL": train_X.columns.tolist()[0]}) - options.update({"TIME_SERIES_DATA_COL": train_y.columns.tolist()[0]}) - # pickpocket session object from the dataframe - session = train_X._get_block().expr._session + input_data = X_train.join(y_train, how="outer") + options.update({"TIME_SERIES_TIMESTAMP_COL": X_train.columns.tolist()[0]}) + options.update({"TIME_SERIES_DATA_COL": y_train.columns.tolist()[0]}) + + session = X_train._get_block().expr._session source_sql = input_data.sql options_sql = ml_sql.options(**options) diff --git a/bigframes/ml/decomposition.py b/bigframes/ml/decomposition.py index 16106d3a7b..75b57f2e54 100644 --- a/bigframes/ml/decomposition.py +++ b/bigframes/ml/decomposition.py @@ -61,7 +61,7 @@ def fit( (X,) = utils.convert_to_dataframe(X) self._bqml_model = core.create_bqml_model( - train_X=X, + X_train=X, transforms=transforms, options={ "model_type": "PCA", diff --git a/bigframes/ml/imported.py b/bigframes/ml/imported.py index 89078f8267..d4571eb3e5 100644 --- a/bigframes/ml/imported.py +++ b/bigframes/ml/imported.py @@ -34,8 +34,12 @@ class TensorFlowModel(base.Predictor): model_path (str): GCS path that holds the model files.""" - def __init__(self, session: bigframes.Session, model_path: Optional[str] = None): - self.session = session + def __init__( + self, + session: Optional[bigframes.Session] = None, + model_path: Optional[str] = None, + ): + self.session = session or bpd.get_global_session() self.model_path = model_path self._bqml_model: Optional[core.BqmlModel] = None @@ -112,8 +116,12 @@ class ONNXModel(base.Predictor): model_path (str): Cloud Storage path that holds the model files.""" - def __init__(self, session: bigframes.Session, model_path: Optional[str] = None): - self.session = session + def __init__( + self, + session: Optional[bigframes.Session] = None, + model_path: Optional[str] = None, + ): + self.session = 
session or bpd.get_global_session() self.model_path = model_path self._bqml_model: Optional[core.BqmlModel] = None diff --git a/bigframes/ml/llm.py b/bigframes/ml/llm.py index def97b56ff..345e3deb72 100644 --- a/bigframes/ml/llm.py +++ b/bigframes/ml/llm.py @@ -16,10 +16,10 @@ from __future__ import annotations -from typing import cast, Union +from typing import cast, Optional, Union import bigframes -import bigframes.constants as constants +from bigframes import clients, constants from bigframes.core import blocks from bigframes.ml import base, core, utils import bigframes.pandas as bpd @@ -35,17 +35,43 @@ class PaLM2TextGenerator(base.Predictor): """PaLM2 text generator LLM model. Args: - session (BigQuery Session): - BQ session to create the model - connection_name (str): - connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>""" - - def __init__(self, session: bigframes.Session, connection_name: str): - self.session = session - self.connection_name = connection_name + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>. + if None, use default connection in session context. + """ + + def __init__( + self, + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + self.session = session or bpd.get_global_session() + self.connection_name = connection_name or self.session._bq_connection + self._bq_connection_manager = clients.BqConnectionManager( + self.session.bqconnectionclient, self.session.resourcemanagerclient + ) self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): + # Parse and create connection if needed. + if not self.connection_name: + raise ValueError( + "Must provide connection_name, either in constructor or through session options."
+ ) + connection_name_parts = self.connection_name.split(".") + if len(connection_name_parts) != 3: + raise ValueError( + f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}." + ) + self._bq_connection_manager.create_bq_connection( + project_id=connection_name_parts[0], + location=connection_name_parts[1], + connection_id=connection_name_parts[2], + iam_role="aiplatform.user", + ) + options = { "remote_service_type": _REMOTE_TEXT_GENERATOR_MODEL_CODE, } @@ -140,17 +166,43 @@ class PaLM2TextEmbeddingGenerator(base.Predictor): """PaLM2 text embedding generator LLM model. Args: - session (BigQuery Session): - BQ session to create the model - connection_name (str): - connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>""" - - def __init__(self, session: bigframes.Session, connection_name: str): - self.session = session - self.connection_name = connection_name + session (bigframes.Session or None): + BQ session to create the model. If None, use the global default session. + connection_name (str or None): + connection to connect with remote service. str of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>. + if None, use default connection in session context. + """ + + def __init__( + self, + session: Optional[bigframes.Session] = None, + connection_name: Optional[str] = None, + ): + self.session = session or bpd.get_global_session() + self.connection_name = connection_name or self.session._bq_connection + self._bq_connection_manager = clients.BqConnectionManager( + self.session.bqconnectionclient, self.session.resourcemanagerclient + ) self._bqml_model: core.BqmlModel = self._create_bqml_model() def _create_bqml_model(self): + # Parse and create connection if needed. + if not self.connection_name: + raise ValueError( + "Must provide connection_name, either in constructor or through session options."
+ ) + connection_name_parts = self.connection_name.split(".") + if len(connection_name_parts) != 3: + raise ValueError( + f"connection_name must be of the format <PROJECT_NUMBER/PROJECT_ID>.<LOCATION>.<CONNECTION_ID>, got {self.connection_name}." + ) + self._bq_connection_manager.create_bq_connection( + project_id=connection_name_parts[0], + location=connection_name_parts[1], + connection_id=connection_name_parts[2], + iam_role="aiplatform.user", + ) + options = { "remote_service_type": _REMOTE_EMBEDDING_GENERATOR_MODEL_CODE, } diff --git a/bigframes/ml/sql.py b/bigframes/ml/sql.py index bcd8243582..feb7ff7835 100644 --- a/bigframes/ml/sql.py +++ b/bigframes/ml/sql.py @@ -34,8 +34,8 @@ def _encode_value(v: Union[str, int, float, Iterable[str]]) -> str: raise ValueError(f"Unexpected value type. {constants.FEEDBACK_LINK}") -def _build_param_Iterable(**kwargs: Union[str, int, float, Iterable[str]]) -> str: - """Encode a dict of values into a formatted Iterable of KVPs for SQL""" +def _build_parameters(**kwargs: Union[str, int, float, Iterable[str]]) -> str: + """Encode a dict of values into a formatted Iterable of key-value pairs for SQL""" indent_str = " " param_strs = [f"{k}={_encode_value(v)}" for k, v in kwargs.items()] return "\n" + indent_str + f",\n{indent_str}".join(param_strs) @@ -43,10 +43,10 @@ def _build_param_Iterable(**kwargs: Union[str, int, float, Iterable[str]]) -> st def options(**kwargs: Union[str, int, float, Iterable[str]]) -> str: """Encode the OPTIONS clause for BQML""" - return f"OPTIONS({_build_param_Iterable(**kwargs)})" + return f"OPTIONS({_build_parameters(**kwargs)})" -def _build_struct_param_Iterable(**kwargs: Union[int, float]) -> str: +def _build_structs(**kwargs: Union[int, float]) -> str: """Encode a dict of values into a formatted STRUCT items for SQL""" indent_str = " " param_strs = [f"{v} AS {k}" for k, v in kwargs.items()] @@ -55,10 +55,10 @@ def _build_struct_param_Iterable(**kwargs: Union[int, float]) -> str: def struct_options(**kwargs: Union[int, float]) -> str: """Encode a BQ
STRUCT as options.""" - return f"STRUCT({_build_struct_param_Iterable(**kwargs)})" + return f"STRUCT({_build_structs(**kwargs)})" -def _build_expr_Iterable(*expr_sqls: str) -> str: +def _build_expressions(*expr_sqls: str) -> str: """Encode a Iterable of SQL expressions into a formatted Iterable for SQL""" indent_str = " " return "\n" + indent_str + f",\n{indent_str}".join(expr_sqls) @@ -66,7 +66,7 @@ def _build_expr_Iterable(*expr_sqls: str) -> str: def transform(*expr_sqls: str) -> str: """Encode the TRANSFORM clause for BQML""" - return f"TRANSFORM({_build_expr_Iterable(*expr_sqls)})" + return f"TRANSFORM({_build_expressions(*expr_sqls)})" def connection(conn_name: str) -> str: @@ -137,7 +137,7 @@ def alter_model( return "\n".join(parts) -def ml_evaluate(model_name: str, source_sql: Union[str, None] = None) -> str: +def ml_evaluate(model_name: str, source_sql: Optional[str] = None) -> str: """Encode ML.EVALUATE for BQML""" if source_sql is None: return f"""SELECT * FROM ML.EVALUATE(MODEL `{model_name}`)""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 9305cf1dda..fa43f725f6 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -38,6 +38,7 @@ # FLOAT64 has 11 exponent bits, so max values is about 2**(2**10) # ln(2**(2**10)) == (2**10)*ln(2) ~= 709.78, so EXP(x) for x>709.78 will overflow. 
_FLOAT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(709.78)) +_INT64_EXP_BOUND = typing.cast(ibis_types.NumericValue, ibis_types.literal(43.6)) BinaryOp = typing.Callable[[ibis_types.Value, ibis_types.Value], ibis_types.Value] TernaryOp = typing.Callable[ @@ -538,12 +539,27 @@ def __init__(self, values, match_nulls: bool = True): self._match_nulls = match_nulls def _as_ibis(self, x: ibis_types.Value): - if self._match_nulls and any(is_null(value) for value in self._values): - return x.isnull() | x.isin( - [val for val in self._values if not is_null(val)] - ) + contains_nulls = any(is_null(value) for value in self._values) + matchable_ibis_values = [] + for item in self._values: + if not is_null(item): + try: + # we want values that *could* be cast to the dtype, but we don't want + # to actually cast it, as that could be lossy (eg float -> int) + item_inferred_type = ibis.literal(item).type() + if ( + x.type() == item_inferred_type + or x.type().is_numeric() + and item_inferred_type.is_numeric() + ): + matchable_ibis_values.append(item) + except TypeError: + pass + + if self._match_nulls and contains_nulls: + return x.isnull() | x.isin(matchable_ibis_values) else: - return x.isin(self._values) + return x.isin(matchable_ibis_values) class BinopPartialRight(UnaryOp): @@ -746,6 +762,94 @@ def div_op( ) +@short_circuit_nulls(ibis_dtypes.float) +def pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + if x.type().is_integer() and y.type().is_integer(): + return _int_pow_op(x, y) + else: + return _float_pow_op(x, y) + + +def _int_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Need to avoid any error cases - should produce NaN instead + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_as_decimal = typing.cast( + ibis_types.NumericValue, + x.cast(ibis_dtypes.Decimal(precision=38, scale=9, nullable=True)), + ) + y_val = typing.cast(ibis_types.NumericValue, y) + + # BQ POW() 
function outputs FLOAT64, which can lose precision. + # Therefore, we do math in NUMERIC and cast back down after. + # Also, explicit bounds checks, pandas will silently overflow. + pow_result = x_as_decimal**y_val + overflow_cond = (pow_result > _ibis_num((2**63) - 1)) | ( + pow_result < _ibis_num(-(2**63)) + ) + + return ( + ibis.case() + .when((overflow_cond), ibis.null()) + .else_(pow_result.cast(ibis_dtypes.int64)) + .end() + ) + + +def _float_pow_op( + x: ibis_types.Value, + y: ibis_types.Value, +): + # Most conditions here seek to prevent calling BQ POW with inputs that would generate errors. + # See: https://cloud.google.com/bigquery/docs/reference/standard-sql/mathematical_functions#pow + x_val = typing.cast(ibis_types.NumericValue, x) + y_val = typing.cast(ibis_types.NumericValue, y) + + overflow_cond = (x_val != _ZERO) & ((y_val * x_val.abs().ln()) > _FLOAT64_EXP_BOUND) + + # Float64 loses integer precision beyond 2**53; beyond this there is insufficient precision to get parity + exp_too_big = y_val.abs() > _ibis_num(2**53) + # Treat very large exponents as +-INF + norm_exp = exp_too_big.ifelse(_INF * y_val.sign(), y_val) + + pow_result = x_val**norm_exp + + # This cast is dangerous, it needs to be executed only where y_val has been bounds-checked + # Ibis needs try_cast binding to bq safe_cast + exponent_is_whole = y_val.cast(ibis_dtypes.int64) == y_val + odd_exponent = (x_val < _ZERO) & ( + y_val.cast(ibis_dtypes.int64) % _ibis_num(2) == _ibis_num(1) + ) + infinite_base = x_val.abs() == _INF + + return ( + ibis.case() + # Might be able to do something more clever with x_val==0 case + .when(y_val == _ZERO, _ibis_num(1)) + .when( + x_val == _ibis_num(1), _ibis_num(1) + ) # Need to ignore exponent, even if it is NA + .when( + (x_val == _ZERO) & (y_val < _ZERO), _INF + ) # This case would error POW function in BQ + .when(infinite_base, pow_result) + .when( + exp_too_big, pow_result + ) # Bigquery can actually handle the +-inf cases gracefully + .when((x_val < _ZERO) &
(~exponent_is_whole), _NAN) + .when( + overflow_cond, _INF * odd_exponent.ifelse(_ibis_num(-1), _ibis_num(1)) + ) # finite overflows would cause bq to error + .else_(pow_result) + .end() + ) + + @short_circuit_nulls(ibis_dtypes.bool) def lt_op( x: ibis_types.Value, @@ -880,6 +984,15 @@ def partial_right(op: BinaryOp, scalar: typing.Any) -> UnaryOp: return BinopPartialRight(op, scalar) +NUMPY_TO_BINOP: typing.Final = { + np.add: add_op, + np.subtract: sub_op, + np.multiply: mul_op, + np.divide: div_op, + np.power: pow_op, +} + + # Ternary ops def where_op( original: ibis_types.Value, diff --git a/bigframes/pandas/__init__.py b/bigframes/pandas/__init__.py index 280fce1112..e1a23e67a1 100644 --- a/bigframes/pandas/__init__.py +++ b/bigframes/pandas/__init__.py @@ -52,6 +52,7 @@ import bigframes.series import bigframes.session import third_party.bigframes_vendored.pandas.core.reshape.concat as vendored_pandas_concat +import third_party.bigframes_vendored.pandas.core.reshape.merge as vendored_pandas_merge import third_party.bigframes_vendored.pandas.core.reshape.tile as vendored_pandas_tile @@ -132,6 +133,37 @@ def cut( cut.__doc__ = vendored_pandas_tile.cut.__doc__ +def merge( + left: DataFrame, + right: DataFrame, + how: Literal[ + "inner", + "left", + "outer", + "right", + ] = "inner", + on: Optional[str] = None, + *, + left_on: Optional[str] = None, + right_on: Optional[str] = None, + sort: bool = False, + suffixes: tuple[str, str] = ("_x", "_y"), +) -> DataFrame: + return bigframes.core.joins.merge( + left, + right, + how=how, + on=on, + left_on=left_on, + right_on=right_on, + sort=sort, + suffixes=suffixes, + ) + + +merge.__doc__ = vendored_pandas_merge.merge.__doc__ + + def _set_default_session_location_if_possible(query): # Set the location as per the query if this is the first query the user is # running and: @@ -390,7 +422,6 @@ def read_gbq_function(function_name: str): read_gbq_function.__doc__ = inspect.getdoc(bigframes.session.Session.read_gbq_function) - 
# pandas dtype attributes NA = pandas.NA BooleanDtype = pandas.BooleanDtype @@ -421,6 +452,7 @@ def read_gbq_function(function_name: str): __all___ = [ # Functions "concat", + "merge", "read_csv", "read_gbq", "read_gbq_function", diff --git a/bigframes/remote_function.py b/bigframes/remote_function.py index 2a4b919dab..6932e5b580 100644 --- a/bigframes/remote_function.py +++ b/bigframes/remote_function.py @@ -26,7 +26,6 @@ import sys import tempfile import textwrap -import time from typing import List, NamedTuple, Optional, Sequence, TYPE_CHECKING if TYPE_CHECKING: @@ -49,6 +48,7 @@ import ibis.expr.operations as ops import ibis.expr.rules as rlz +from bigframes import clients import bigframes.constants as constants # TODO(shobs): Change the min log level to INFO after the development stabilizes @@ -167,35 +167,22 @@ def __init__( self._bq_location = bq_location self._bq_dataset = bq_dataset self._bq_client = bq_client - self._bq_connection_client = bq_connection_client self._bq_connection_id = bq_connection_id - self._cloud_resource_manager_client = cloud_resource_manager_client + self._bq_connection_manager = clients.BqConnectionManager( + bq_connection_client, cloud_resource_manager_client + ) def create_bq_remote_function( self, input_args, input_types, output_type, endpoint, bq_function_name ): """Create a BigQuery remote function given the artifacts of a user defined function and the http endpoint of a corresponding cloud function.""" - # TODO(shobs): The below command to enable BigQuery Connection API needs - # to be automated. Disabling for now since most target users would not - # have the privilege to enable API in a project. 
- # log("Making sure BigQuery Connection API is enabled") - # if os.system("gcloud services enable bigqueryconnection.googleapis.com"): - # raise ValueError("Failed to enable BigQuery Connection API") - - # If the intended connection does not exist then create it - service_account_id = self.get_service_account_if_connection_exists() - if service_account_id: - logger.info(f"Connector {self._bq_connection_id} already exists") - else: - connection_name, service_account_id = self.create_bq_connection() - logger.info( - f"Created BQ connection {connection_name} with service account id: {service_account_id}" - ) - - # Ensure IAM role on the BQ connection - # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function - self._ensure_iam_binding(service_account_id, "run.invoker") + self._bq_connection_manager.create_bq_connection( + self._gcp_project_id, + self._bq_location, + self._bq_connection_id, + "run.invoker", + ) # Create BQ function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2 @@ -237,86 +224,6 @@ def get_cloud_function_endpoint(self, name): pass return None - # Introduce retries to accommodate transient errors like etag mismatch, - # which can be caused by concurrent operation on the same resource, and - # manifests with message like: - # google.api_core.exceptions.Aborted: 409 There were concurrent policy - # changes. Please retry the whole read-modify-write with exponential - # backoff. The request's ETag '\007\006\003,\264\304\337\272' did not match - # the current policy's ETag '\007\006\003,\3750&\363'. 
- @google.api_core.retry.Retry( - predicate=google.api_core.retry.if_exception_type( - google.api_core.exceptions.Aborted - ), - initial=10, - maximum=20, - multiplier=2, - timeout=60, - ) - def _ensure_iam_binding(self, service_account: str, role: str): - """Ensure necessary IAM role is configured on a service account.""" - project = f"projects/{self._gcp_project_id}" - service_account = f"serviceAccount:{service_account}" - role = f"roles/{role}" - request = google.iam.v1.iam_policy_pb2.GetIamPolicyRequest(resource=project) - policy = self._cloud_resource_manager_client.get_iam_policy(request=request) - - # Check if the binding already exists, and if does, do nothing more - for binding in policy.bindings: - if binding.role == role: - if service_account in binding.members: - return - - # Create a new binding - new_binding = google.iam.v1.policy_pb2.Binding( - role=role, members=[service_account] - ) - policy.bindings.append(new_binding) - request = google.iam.v1.iam_policy_pb2.SetIamPolicyRequest( - resource=project, policy=policy - ) - self._cloud_resource_manager_client.set_iam_policy(request=request) - - # We would wait for the IAM policy change to take effect - # https://cloud.google.com/iam/docs/access-change-propagation - logger.info( - f"Waiting {self._iam_wait_seconds} seconds for IAM to take effect.." 
- ) - time.sleep(self._iam_wait_seconds) - - def create_bq_connection(self): - """Create the BigQuery Connection and returns corresponding service account id.""" - client = self._bq_connection_client - connection = bigquery_connection_v1.Connection( - cloud_resource=bigquery_connection_v1.CloudResourceProperties() - ) - request = bigquery_connection_v1.CreateConnectionRequest( - parent=client.common_location_path(self._gcp_project_id, self._bq_location), - connection_id=self._bq_connection_id, - connection=connection, - ) - connection = client.create_connection(request) - return connection.name, connection.cloud_resource.service_account_id - - def get_service_account_if_connection_exists(self) -> Optional[str]: - """Check if the BigQuery Connection exists.""" - client = self._bq_connection_client - request = bigquery_connection_v1.GetConnectionRequest( - name=client.connection_path( - self._gcp_project_id, self._bq_location, self._bq_connection_id - ) - ) - - service_account = None - try: - service_account = client.get_connection( - request=request - ).cloud_resource.service_account_id - except google.api_core.exceptions.NotFound: - pass - - return service_account - def generate_udf_code(self, def_, dir): """Generate serialized bytecode using cloudpickle given a udf.""" udf_code_file_name = "udf.py" @@ -825,7 +732,7 @@ def remote_function( # A connection is required for BQ remote function # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function if not bigquery_connection and session: - bigquery_connection = session._remote_udf_connection # type: ignore + bigquery_connection = session._bq_connection # type: ignore if not bigquery_connection: raise ValueError( "BigQuery connection must be provided, either directly or via session. 
" diff --git a/bigframes/series.py b/bigframes/series.py index 49b0a5b1f0..8eadee37ed 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -387,9 +387,24 @@ def rank( ) -> Series: return Series(block_ops.rank(self._block, method, na_option, ascending)) - def fillna(self, value=None) -> "Series" | None: + def fillna(self, value=None) -> Series: return self._apply_binary_op(value, ops.fillna_op) + def dropna( + self, + *, + axis: int = 0, + inplace: bool = False, + how: typing.Optional[str] = None, + ignore_index: bool = False, + ) -> Series: + if inplace: + raise NotImplementedError("'inplace'=True not supported") + result = block_ops.dropna(self._block, how="any") + if ignore_index: + result = result.reset_index() + return Series(result) + def head(self, n: int = 5) -> Series: return typing.cast(Series, self.iloc[0:n]) @@ -547,6 +562,18 @@ def floordiv(self, other: float | int | Series) -> Series: def rfloordiv(self, other: float | int | Series) -> Series: return self._apply_binary_op(other, ops.reverse(ops.floordiv_op)) + def __pow__(self, other: float | int | Series) -> Series: + return self.pow(other) + + def __rpow__(self, other: float | int | Series) -> Series: + return self.rpow(other) + + def pow(self, other: float | int | Series) -> Series: + return self._apply_binary_op(other, ops.pow_op) + + def rpow(self, other: float | int | Series) -> Series: + return self._apply_binary_op(other, ops.reverse(ops.pow_op)) + def __lt__(self, other: float | int | Series) -> Series: # type: ignore return self.lt(other) @@ -843,23 +870,15 @@ def argmin(self) -> scalars.Scalar: @property def is_monotonic_increasing(self) -> bool: - period = 1 - window = bigframes.core.WindowSpec( - preceding=period, - following=None, + return typing.cast( + bool, self._block.is_monotonic_increasing(self._value_column) ) - shifted_series = self._apply_window_op(agg_ops.ShiftOp(period), window) - return self.notna().__and__(self >= shifted_series).all() @property def 
is_monotonic_decreasing(self) -> bool: - period = 1 - window = bigframes.core.WindowSpec( - preceding=period, - following=None, + return typing.cast( + bool, self._block.is_monotonic_decreasing(self._value_column) ) - shifted_series = self._apply_window_op(agg_ops.ShiftOp(period), window) - return self.notna().__and__(self <= shifted_series).all() def __getitem__(self, indexer): # TODO: enforce stricter alignment, should fail if indexer is missing any keys. @@ -1105,9 +1124,12 @@ def mask(self, cond, other=None) -> Series: ) return self.where(~cond, other) - def to_frame(self) -> bigframes.dataframe.DataFrame: + def to_frame(self, name: blocks.Label = None) -> bigframes.dataframe.DataFrame: + provided_name = name if name else self.name # To be consistent with Pandas, it assigns 0 as the column name if missing. 0 is the first element of RangeIndex. - block = self._block.with_column_labels([self.name] if self.name else ["0"]) + block = self._block.with_column_labels( + [provided_name] if provided_name else ["0"] + ) return bigframes.dataframe.DataFrame(block) def to_csv(self, path_or_buf=None, **kwargs) -> typing.Optional[str]: @@ -1191,6 +1213,57 @@ def to_string( def to_xarray(self): return self.to_pandas().to_xarray() + def _throw_if_index_contains_duplicates( + self, error_message: typing.Optional[str] = None + ) -> None: + duplicates_block, _ = block_ops.indicate_duplicates( + self._get_block(), self._get_block().index_columns + ) + duplicates_block = duplicates_block.with_column_labels( + ["values", "is_duplicate"] + ) + duplicates_df = bigframes.dataframe.DataFrame(duplicates_block) + if duplicates_df["is_duplicate"].any(): + error_message = ( + error_message + if error_message + else "Index contains duplicate entries, but uniqueness is required." 
+ ) + raise pandas.errors.InvalidIndexError(error_message) + + def map( + self, + arg: typing.Union[Mapping, Series], + na_action: Optional[str] = None, + *, + verify_integrity: bool = False, + ) -> Series: + if na_action: + raise NotImplementedError( + f"Non-None na_action argument is not yet supported for Series.map. {constants.FEEDBACK_LINK}" + ) + if isinstance(arg, Series): + if verify_integrity: + error_message = "When verify_integrity is True in Series.map, index of arg parameter must not have duplicate entries." + arg._throw_if_index_contains_duplicates(error_message=error_message) + map_df = bigframes.dataframe.DataFrame(arg._block) + map_df = map_df.rename(columns={arg.name: self.name}) + elif isinstance(arg, Mapping): + map_df = bigframes.dataframe.DataFrame( + {"keys": list(arg.keys()), self.name: list(arg.values())}, + session=self._get_block().expr._session, + ) + map_df = map_df.set_index("keys") + elif callable(arg): + return self.apply(arg) + else: + # Mirroring pandas, call the uncallable object + arg() # throws TypeError: object is not callable + + self_df = self.to_frame(name="series") + result_df = self_df.join(map_df, on="series") + return result_df[self.name] + def __array_ufunc__( self, ufunc: numpy.ufunc, method: str, *inputs, **kwargs ) -> Series: @@ -1198,16 +1271,17 @@ def __array_ufunc__( See: https://numpy.org/doc/stable/reference/ufuncs.html """ # Only __call__ supported with zero arguments - if ( - inputs[0] is not self - or method != "__call__" - or len(inputs) > 1 - or len(kwargs) > 0 - ): + if method != "__call__" or len(inputs) > 2 or len(kwargs) > 0: return NotImplemented - if ufunc in ops.NUMPY_TO_OP: + if len(inputs) == 1 and ufunc in ops.NUMPY_TO_OP: return self._apply_unary_op(ops.NUMPY_TO_OP[ufunc]) + if len(inputs) == 2 and ufunc in ops.NUMPY_TO_BINOP: + binop = ops.NUMPY_TO_BINOP[ufunc] + if inputs[0] is self: + return self._apply_binary_op(inputs[1], binop) + else: + return self._apply_binary_op(inputs[0], 
ops.reverse(binop)) return NotImplemented diff --git a/bigframes/session.py b/bigframes/session.py index ac2f8fa53a..1744407772 100644 --- a/bigframes/session.py +++ b/bigframes/session.py @@ -270,9 +270,22 @@ class Session( third_party_pandas_readers.ReaderIOMixin, ): """Establishes a BigQuery connection to capture a group of job activities related to - DataFrames.""" + DataFrames. - def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): + Args: + context (bigframes._config.bigquery_options.BigQueryOptions): + Configuration adjusting how to connect to BigQuery and related + APIs. Note that some options are ignored if ``clients_provider`` is + set. + clients_provider (bigframes.session.ClientsProvider): + An object providing client library objects. + """ + + def __init__( + self, + context: Optional[bigquery_options.BigQueryOptions] = None, + clients_provider: Optional[ClientsProvider] = None, + ): if context is None: context = bigquery_options.BigQueryOptions() @@ -288,12 +301,15 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): # Instantiate a clients provider to help with cloud clients that will be # used in the future operations in the session - self._clients_provider = ClientsProvider( - project=context.project, - location=self._location, - use_regional_endpoints=context.use_regional_endpoints, - credentials=context.credentials, - ) + if clients_provider: + self._clients_provider = clients_provider + else: + self._clients_provider = ClientsProvider( + project=context.project, + location=self._location, + use_regional_endpoints=context.use_regional_endpoints, + credentials=context.credentials, + ) self._create_and_bind_bq_session() self.ibis_client = typing.cast( @@ -305,7 +321,7 @@ def __init__(self, context: Optional[bigquery_options.BigQueryOptions] = None): ), ) - self._remote_udf_connection = context.remote_udf_connection + self._bq_connection = context.bq_connection # Now that we're starting the 
session, don't allow the options to be # changed. @@ -381,10 +397,10 @@ def close(self): try: query_job = self.bqclient.query(abort_session_query) query_job.result() # blocks until finished - except google.api_core.exceptions.BadRequest as e: + except google.api_core.exceptions.BadRequest as exc: # Ignore the exception when the BQ session itself has expired # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session - if not e.message.startswith( + if not exc.message.startswith( f"Session {self._session_id} has expired and is no longer available." ): raise diff --git a/notebooks/experimental/longer_ml_demo.ipynb b/notebooks/experimental/longer_ml_demo.ipynb index c4d133421f..793ff58ecd 100644 --- a/notebooks/experimental/longer_ml_demo.ipynb +++ b/notebooks/experimental/longer_ml_demo.ipynb @@ -1373,9 +1373,9 @@ } ], "source": [ - "train_x = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "train_y = training_data[['body_mass_g']]\n", - "model.fit(train_x, train_y)\n", + "X_train = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", + "y_train = training_data[['body_mass_g']]\n", + "model.fit(X_train, y_train)\n", "model" ] }, @@ -1453,7 +1453,7 @@ } ], "source": [ - "model.score(train_x, train_y)" + "model.score(X_train, y_train)" ] }, { @@ -1490,12 +1490,12 @@ "# lets define a preprocessing step that adjust the linear measurements to use the cube\n", "'''\n", "def cubify(penguin_df):\n", - " penguin_df.culmen_length_mm = train_x.culmen_length_mm.pow(3)\n", - " penguin_df.culmen_depth_mm = train_x.culmen_depth_mm.pow(3)\n", - " penguin_df.flipper_length_mm = train_x.flipper_length_mm.pow(3)\n", + " penguin_df.culmen_length_mm = X_train.culmen_length_mm.pow(3)\n", + " penguin_df.culmen_depth_mm = X_train.culmen_depth_mm.pow(3)\n", + " penguin_df.flipper_length_mm = X_train.flipper_length_mm.pow(3)\n", "\n", - "cubify(train_x)\n", - "train_x\n", 
+ "cubify(X_train)\n", + "X_train\n", "'''" ] }, @@ -1519,7 +1519,7 @@ "source": [ "# AS ABOVE, SKIP FOR NOW\n", "'''\n", - "model.fit(train_x, train_y)\n", + "model.fit(X_train, y_train)\n", "model.evaluate()\n", "'''" ] @@ -1756,15 +1756,15 @@ "training_data = training_data.dropna()\n", "\n", "# And we'll include species in our features\n", - "train_x = training_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "train_y = training_data[['body_mass_g']]\n", - "model.fit(train_x, train_y)\n", + "X_train = training_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", + "y_train = training_data[['body_mass_g']]\n", + "model.fit(X_train, y_train)\n", "\n", "# And we'll evaluate it on the Adelie penguins only\n", "adelie_data = training_data[training_data.species == \"Adelie Penguin (Pygoscelis adeliae)\"]\n", - "test_x = adelie_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", - "test_y = adelie_data[['body_mass_g']]\n", - "model.score(test_x, test_y)" + "X_test = adelie_data[['species', 'island', 'culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'sex']]\n", + "y_test = adelie_data[['body_mass_g']]\n", + "model.score(X_test, y_test)" ] }, { @@ -1852,7 +1852,7 @@ " ('linreg', LinearRegression())\n", "])\n", "\n", - "pipe.fit(train_x, train_y)\n", + "pipe.fit(X_train, y_train)\n", "pipe.evaluate()" ] }, diff --git a/notebooks/getting_started/ml_fundamentals.ipynb b/notebooks/getting_started/ml_fundamentals.ipynb index ade50bcbc2..e48aff1d57 100644 --- a/notebooks/getting_started/ml_fundamentals.ipynb +++ b/notebooks/getting_started/ml_fundamentals.ipynb @@ -567,10 +567,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "train_X shape: (267, 6)\n", - "test_X shape: (67, 6)\n", - "train_y shape: (267, 1)\n", - "test_y shape: (67, 1)\n" + "X_train shape: (267, 6)\n", + "X_test shape: (67, 6)\n", + "y_train 
shape: (267, 1)\n", + "y_test shape: (67, 1)\n" ] } ], @@ -584,14 +584,14 @@ "\n", "# This will split X and y into test and training sets, with 20% of the rows in the test set,\n", "# and the rest in the training set\n", - "train_X, test_X, train_y, test_y = train_test_split(\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", " X, y, test_size=0.2)\n", "\n", "# Show the shape of the data after the split\n", - "print(f\"\"\"train_X shape: {train_X.shape}\n", - "test_X shape: {test_X.shape}\n", - "train_y shape: {train_y.shape}\n", - "test_y shape: {test_y.shape}\"\"\")" + "print(f\"\"\"X_train shape: {X_train.shape}\n", + "X_test shape: {X_test.shape}\n", + "y_train shape: {y_train.shape}\n", + "y_test shape: {y_test.shape}\"\"\")" ] }, { @@ -758,7 +758,7 @@ "source": [ "# If we look at the data, we can see that random rows were selected for\n", "# each side of the split\n", - "test_X.head(5)" + "X_test.head(5)" ] }, { @@ -880,8 +880,8 @@ } ], "source": [ - "# Note that this matches the rows in test_X\n", - "test_y.head(5)" + "# Note that this matches the rows in X_test\n", + "y_test.head(5)" ] }, { @@ -1242,11 +1242,11 @@ "numeric_columns = [\"culmen_length_mm\", \"culmen_depth_mm\", \"flipper_length_mm\"]\n", "\n", "scaler = StandardScaler()\n", - "scaler.fit(train_X[numeric_columns])\n", + "scaler.fit(X_train[numeric_columns])\n", "\n", "# Now, standardscaler should transform the numbers to have mean of zero\n", "# and standard deviation of one:\n", - "scaler.transform(train_X[numeric_columns])" + "scaler.transform(X_train[numeric_columns])" ] }, { @@ -1580,7 +1580,7 @@ ], "source": [ "# We can then repeat this transformation on new data\n", - "scaler.transform(test_X[numeric_columns])" + "scaler.transform(X_test[numeric_columns])" ] }, { @@ -2070,12 +2070,12 @@ " (\"encode\", OneHotEncoder(), [\"species\", \"sex\", \"island\"])])\n", "\n", "# Now we can fit all columns of the training data\n", - "preproc.fit(train_X)\n", + "preproc.fit(X_train)\n", 
"\n", - "processed_train_X = preproc.transform(train_X)\n", - "processed_test_X = preproc.transform(test_X)\n", + "processed_X_train = preproc.transform(X_train)\n", + "processed_X_test = preproc.transform(X_test)\n", "\n", - "processed_train_X" + "processed_X_train" ] }, { @@ -2347,12 +2347,12 @@ "linreg = LinearRegression()\n", "\n", "# Learn from the training data how to predict output y\n", - "linreg.fit(processed_train_X, train_y)\n", + "linreg.fit(processed_X_train, y_train)\n", "\n", "# Predict y for the test data\n", - "predicted_test_y = linreg.predict(processed_test_X)\n", + "predicted_y_test = linreg.predict(processed_X_test)\n", "\n", - "predicted_test_y" + "predicted_y_test" ] }, { @@ -2617,9 +2617,9 @@ "\n", "kmeans = KMeans(n_clusters=4)\n", "\n", - "kmeans.fit(processed_train_X)\n", + "kmeans.fit(processed_X_train)\n", "\n", - "kmeans.predict(processed_test_X)" + "kmeans.predict(processed_X_test)" ] }, { @@ -2924,10 +2924,10 @@ } ], "source": [ - "pipeline.fit(train_X, train_y)\n", + "pipeline.fit(X_train, y_train)\n", "\n", - "predicted_test_y = pipeline.predict(test_X)\n", - "predicted_test_y" + "predicted_y_test = pipeline.predict(X_test)\n", + "predicted_y_test" ] }, { @@ -3084,7 +3084,7 @@ ], "source": [ "# In the case of a pipeline, this will be equivalent to calling .score on the contained LinearRegression\n", - "pipeline.score(test_X, test_y)" + "pipeline.score(X_test, y_test)" ] }, { @@ -3156,7 +3156,7 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "r2_score(test_y, predicted_test_y)" + "r2_score(y_test, predicted_y_test)" ] }, { diff --git a/notebooks/regression/sklearn_linear_regression.ipynb b/notebooks/regression/sklearn_linear_regression.ipynb index 8c0a21c0ff..beb77ef092 100644 --- a/notebooks/regression/sklearn_linear_regression.ipynb +++ b/notebooks/regression/sklearn_linear_regression.ipynb @@ -867,7 +867,7 @@ "feature_columns = training_data[['island', 'culmen_length_mm', 'culmen_depth_mm', 
'flipper_length_mm', 'sex']]\n", "label_columns = training_data[['body_mass_g']] \n", "\n", - "train_X, test_X, train_y, test_y = train_test_split(\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", " feature_columns, label_columns, test_size=0.2)" ] }, @@ -940,7 +940,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline.fit(train_X, train_y)" + "pipeline.fit(X_train, y_train)" ] }, { @@ -1040,9 +1040,9 @@ "source": [ "from bigframes.ml.metrics import r2_score\n", "\n", - "pred_y = pipeline.predict(test_X)\n", + "pred_y = pipeline.predict(X_test)\n", "\n", - "r2_score(test_y, pred_y)" + "r2_score(y_test, pred_y)" ] }, { diff --git a/notebooks/remote_functions/remote_function.ipynb b/notebooks/remote_functions/remote_function.ipynb index 76c87f8629..06be0e7293 100644 --- a/notebooks/remote_functions/remote_function.ipynb +++ b/notebooks/remote_functions/remote_function.ipynb @@ -2,27 +2,38 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "3613b1cd", "metadata": {}, "outputs": [], "source": [ + "# BigQuery table data on which notebook should be run\n", "TABLE='bigquery-public-data.stackoverflow.comments'\n", - "MAX_ROWS=1000000 # 1 Million" + "\n", + "# Change this up to test the scale, down to run the notebook faster\n", + "MAX_ROWS=100000" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "f1175247", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 30.5 s, sys: 4.25 s, total: 34.8 s\n", - "Wall time: 1min 49s\n" + "CPU times: user 25.4 s, sys: 2.5 s, total: 27.9 s\n", + "Wall time: 2min 31s\n" ] }, { @@ -130,7 +141,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 8" ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -149,7 +160,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "fd8a04a3", "metadata": {}, "outputs": [], @@ -180,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "2b5e4568", "metadata": {}, "outputs": [ @@ -188,8 +199,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.48 s, sys: 27.6 ms, total: 5.51 s\n", - "Wall time: 5.49 s\n" + "CPU times: user 4.22 s, sys: 18.2 ms, total: 4.24 s\n", + "Wall time: 4.26 s\n" ] }, { @@ -308,7 +319,7 @@ "9 154 Sure, but what about a solution using O(1) mem... 8 19" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -322,18 +333,84 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "b81feaef", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 
\n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n", + "/usr/local/google/home/shobs/code/bigframes1/venv/lib/python3.10/site-packages/google/auth/_default.py:78: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a \"quota exceeded\" or \"API not enabled\" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. \n", + " warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2b1c9d671db14d2ca3be6a0b0c698430", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 6b0a39de-40a0-4dd4-be88-248bd8ebcd77 is RUNNING. \n", " 0\n", " \n", " \n", " 1\n", - " 35156124\n", - " Sorry I didn't include my timeout method befor...\n", + " 11013760\n", + " You *should* be concerned with the disk being ...\n", " 0\n", " \n", " \n", " 2\n", - " 35157401\n", - " As soon as I defined some sort of primary Key ...\n", + " 11013784\n", + " have you looked at `Integrate` or `NIntegrate`?\n", " 0\n", " \n", " \n", " 3\n", - " 35158649\n", - " @user3355243 I've edited it to give $values an...\n", + " 11015512\n", + " sorry, is a typo. The variable name is dist. (...\n", " 0\n", " \n", " \n", " 4\n", - " 35162039\n", - " I pasted my exes @Matt.\n", + " 11016238\n", + " Pfff, I'm having trouble with that formula too...\n", " 0\n", " \n", " \n", " 5\n", - " 35162396\n", - " @Gene - I do have separate fields, but I also ...\n", + " 11016276\n", + " Thanks thinksteep! Does this mean that by usin...\n", " 0\n", " \n", " \n", " 6\n", - " 35162907\n", - " could you please provide any kind of sketch ho...\n", + " 11016551\n", + " Jason, thanks for the reply. I've been workin...\n", " 0\n", " \n", " \n", " 7\n", - " 35166498\n", - " We use PhoneGap. 
Using Angular.\n", + " 11017973\n", + " I assume an `off` of 0.5 would put be exactly ...\n", " 0\n", " \n", " \n", " 8\n", - " 35170138\n", - " `decltype((int(Foo::*)(int))(&Foo::foo))` That...\n", + " 11018225\n", + " Thank you very much. I do worry too much abou...\n", " 0\n", " \n", " \n", " 9\n", - " 35172348\n", - " Yes, I found that one already. And I understan...\n", + " 11018370\n", + " @IanClelland, I edited my question a bit. The ...\n", " 0\n", " \n", " \n", "\n", + "

10 rows × 3 columns

\n", "[10 rows x 3 columns in total]" ], "text/plain": [ " id text score\n", - "0 35153602 Are you trying to access a nested array? 0\n", - "1 35156124 Sorry I didn't include my timeout method befor... 0\n", - "2 35157401 As soon as I defined some sort of primary Key ... 0\n", - "3 35158649 @user3355243 I've edited it to give $values an... 0\n", - "4 35162039 I pasted my exes @Matt. 0\n", - "5 35162396 @Gene - I do have separate fields, but I also ... 0\n", - "6 35162907 could you please provide any kind of sketch ho... 0\n", - "7 35166498 We use PhoneGap. Using Angular. 0\n", - "8 35170138 `decltype((int(Foo::*)(int))(&Foo::foo))` That... 0\n", - "9 35172348 Yes, I found that one already. And I understan... 0\n", + "0 11012908 you're welcome! according to the docs it shoul... 0\n", + "1 11013760 You *should* be concerned with the disk being ... 0\n", + "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0\n", + "3 11015512 sorry, is a typo. The variable name is dist. (... 0\n", + "4 11016238 Pfff, I'm having trouble with that formula too... 0\n", + "5 11016276 Thanks thinksteep! Does this mean that by usin... 0\n", + "6 11016551 Jason, thanks for the reply. I've been workin... 0\n", + "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0\n", + "8 11018225 Thank you very much. I do worry too much abou... 0\n", + "9 11018370 @IanClelland, I edited my question a bit. The ... 0\n", "\n", "[10 rows x 3 columns]" ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -461,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "55ed241e", "metadata": {}, "outputs": [ @@ -474,59 +552,73 @@ "remote_function(input_types: 'List[type]', output_type: 'type', dataset: 'Optional[str]' = None, bigquery_connection: 'Optional[str]' = None, reuse: 'bool' = True)\n", " Decorator to turn a user defined function into a BigQuery remote function.\n", " \n", + " .. 
note::\n", + " Please make sure following is setup before using this API:\n", + " \n", + " 1. Have the below APIs enabled for your project:\n", + " \n", + " * BigQuery Connection API\n", + " * Cloud Functions API\n", + " * Cloud Run API\n", + " * Cloud Build API\n", + " * Artifact Registry API\n", + " * Cloud Resource Manager API\n", + " \n", + " This can be done from the cloud console (change `PROJECT_ID` to yours):\n", + " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", + " \n", + " Or from the gcloud CLI:\n", + " \n", + " `$ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com`\n", + " \n", + " 2. Have following IAM roles enabled for you:\n", + " \n", + " * BigQuery Data Editor (roles/bigquery.dataEditor)\n", + " * BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", + " * Cloud Functions Developer (roles/cloudfunctions.developer)\n", + " * Service Account User (roles/iam.serviceAccountUser)\n", + " * Storage Object Viewer (roles/storage.objectViewer)\n", + " * Project IAM Admin (roles/resourcemanager.projectIamAdmin) (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", + " \n", + " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", + " \n", + " 1. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", + " 2. 
To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", + " \n", + " Alternatively, the IAM could also be setup via the gcloud CLI:\n", + " \n", + " `$ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"`.\n", + " \n", " Args:\n", - " input_types : list(type)\n", + " input_types (list(type)):\n", " List of input data types in the user defined function.\n", - " output_type : type\n", + " output_type (type):\n", " Data type of the output in the user defined function.\n", - " dataset : str, Optional\n", - " Dataset to use to create a BigQuery function. It should be in\n", + " dataset (str, Optional):\n", + " Dataset in which to create a BigQuery remote function. It should be in\n", " `.` or `` format. If this\n", - " param is not provided then session dataset id would be used.\n", - " bigquery_connection : str, Optional\n", - " Name of the BigQuery connection. If it is pre created in the same\n", - " location as the `bigquery_client.location` then it would be used,\n", - " otherwise it would be created dynamically assuming the user has\n", - " necessary priviliges. If this param is not provided then the\n", - " bigquery connection from the session would be used.\n", - " reuse : bool, Optional\n", + " parameter is not provided then session dataset id is used.\n", + " bigquery_connection (str, Optional):\n", + " Name of the BigQuery connection. 
You should either have the\n", + " connection already created in the `location` you have chosen, or\n", + " you should have the Project IAM Admin role to enable the service\n", + " to create the connection for you if you need it.If this parameter is\n", + " not provided then the BigQuery connection from the session is used.\n", + " reuse (bool, Optional):\n", " Reuse the remote function if already exists.\n", " `True` by default, which will result in reusing an existing remote\n", " function (if any) that was previously created for the same udf.\n", " Setting it to false would force creating a unique remote function.\n", " If the required remote function does not exist then it would be\n", " created irrespective of this param.\n", + " Returns:\n", + " callable: A remote function object pointing to the cloud assets created\n", + " in the background to support the remote execution. The cloud assets can be\n", + " located through the following properties set in the object:\n", " \n", - " Notes:\n", - " Please make sure following is setup before using this API:\n", - " \n", - " 1. Have the below APIs enabled for your project:\n", - " a. BigQuery Connection API\n", - " b. Cloud Functions API\n", - " c. Cloud Run API\n", - " d. Cloud Build API\n", - " e. Artifact Registry API\n", - " f. 
Cloud Resource Manager API\n", + " `bigframes_cloud_function` - The google cloud function deployed for the user defined code.\n", " \n", - " This can be done from the cloud console (change PROJECT_ID to yours):\n", - " https://console.cloud.google.com/apis/enableflow?apiid=bigqueryconnection.googleapis.com,cloudfunctions.googleapis.com,run.googleapis.com,cloudbuild.googleapis.com,artifactregistry.googleapis.com,cloudresourcemanager.googleapis.com&project=PROJECT_ID\n", - " Or from the gcloud CLI:\n", - " $ gcloud services enable bigqueryconnection.googleapis.com cloudfunctions.googleapis.com run.googleapis.com cloudbuild.googleapis.com artifactregistry.googleapis.com cloudresourcemanager.googleapis.com\n", - " \n", - " 2. Have following IAM roles enabled for you:\n", - " a. BigQuery Data Editor (roles/bigquery.dataEditor)\n", - " b. BigQuery Connection Admin (roles/bigquery.connectionAdmin)\n", - " c. Cloud Functions Developer (roles/cloudfunctions.developer)\n", - " d. Service Account User (roles/iam.serviceAccountUser)\n", - " e. Storage Object Viewer (roles/storage.objectViewer)\n", - " f. Project IAM Admin (roles/resourcemanager.projectIamAdmin)\n", - " (Only required if the bigquery connection being used is not pre-created and is created dynamically with user credentials.)\n", - " \n", - " 3. Either the user has setIamPolicy privilege on the project, or a BigQuery connection is pre-created with necessary IAM role set:\n", - " a. To create a connection, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_connection\n", - " b. 
To set up IAM, follow https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#grant_permission_on_function\n", - " Alternatively, the IAM could also be setup via the gcloud CLI:\n", - " $ gcloud projects add-iam-policy-binding PROJECT_ID --member=\"serviceAccount:CONNECTION_SERVICE_ACCOUNT_ID\" --role=\"roles/run.invoker\"\n", + " `bigframes_remote_function` - The bigquery remote function capable of calling into `bigframes_cloud_function`.\n", "\n" ] } @@ -539,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "c9a8d03d", "metadata": {}, "outputs": [], @@ -555,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "fbc27f81", "metadata": {}, "outputs": [ @@ -563,69 +655,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "[INFO][2023-06-29 01:03:30,557][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-ac72c931423f68dddb4f84f6754e2b28 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmposspvw3v --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", - "Preparing function...\n", - ".done.\n", - "Deploying function...\n", - "[Build]......................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................done\n", - "[Service]....................................................................................................done\n", - "Done.\n", - "You can view your function in the Cloud Console here: 
https://console.cloud.google.com/functions/details/us-central1/bigframes-ac72c931423f68dddb4f84f6754e2b28?project=bigframes-dev\n", - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "buildConfig:\n", - " build: projects/1084210331973/locations/us-central1/builds/57859622-4287-47bc-ab86-3cd3f4c47548\n", - " entryPoint: udf_http\n", - " runtime: python310\n", - " source:\n", - " storageSource:\n", - " bucket: gcf-v2-sources-1084210331973-us-central1\n", - " object: bigframes-ac72c931423f68dddb4f84f6754e2b28/function-source.zip\n", - " sourceProvenance:\n", - " resolvedStorageSource:\n", - " bucket: gcf-v2-sources-1084210331973-us-central1\n", - " generation: '1688000614202977'\n", - " object: bigframes-ac72c931423f68dddb4f84f6754e2b28/function-source.zip\n", - "environment: GEN_2\n", - "labels:\n", - " deployment-tool: cli-gcloud\n", - "name: projects/bigframes-dev/locations/us-central1/functions/bigframes-ac72c931423f68dddb4f84f6754e2b28\n", - "serviceConfig:\n", - " allTrafficOnLatestRevision: true\n", - " availableCpu: '0.1666'\n", - " availableMemory: 256M\n", - " ingressSettings: ALLOW_ALL\n", - " maxInstanceCount: 100\n", - " maxInstanceRequestConcurrency: 1\n", - " revision: bigframes-ac72c931423f68dddb4f84f6754e2b28-00001-faf\n", - " service: projects/bigframes-dev/locations/us-central1/services/bigframes-ac72c931423f68dddb4f84f6754e2b28\n", - " serviceAccountEmail: 1084210331973-compute@developer.gserviceaccount.com\n", - " timeoutSeconds: 60\n", - " uri: https://bigframes-ac72c931423f68dddb4f84f6754e2b28-7krlje3eoq-uc.a.run.app\n", - "state: ACTIVE\n", - "updateTime: '2023-06-29T01:04:34.306070995Z'\n", - "url: https://us-central1-bigframes-dev.cloudfunctions.net/bigframes-ac72c931423f68dddb4f84f6754e2b28\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[INFO][2023-06-29 01:04:37,239][bigframes.remote_function] Successfully created cloud function bigframes-ac72c931423f68dddb4f84f6754e2b28 with uri 
(https://bigframes-ac72c931423f68dddb4f84f6754e2b28-7krlje3eoq-uc.a.run.app)\n", - "[INFO][2023-06-29 01:04:42,402][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", - "[INFO][2023-06-29 01:04:42,404][bigframes.remote_function] Creating BQ remote function: \n", - " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_ac72c931423f68dddb4f84f6754e2b28(n INT64)\n", - " RETURNS INT64\n", - " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", - " OPTIONS (\n", - " endpoint = \"https://bigframes-ac72c931423f68dddb4f84f6754e2b28-7krlje3eoq-uc.a.run.app/"\n", - " )\n", - "[INFO][2023-06-29 01:04:43,456][bigframes.remote_function] Created remote function bigframes-dev.bigframes_temp_us.bigframes_ac72c931423f68dddb4f84f6754e2b28\n" + "[INFO][2023-08-18 21:23:29,687][bigframes.remote_function] Creating new cloud function: gcloud functions deploy bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 --gen2 --runtime=python310 --project=bigframes-dev --region=us-central1 --source=/tmp/tmpl2ewfnue --entry-point=udf_http --trigger-http --no-allow-unauthenticated\n", + "[INFO][2023-08-18 21:24:43,689][bigframes.remote_function] Successfully created cloud function bigframes-b0feb1fbaf8188b64d7e70118d93c5d4 with uri (https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app)\n", + "[INFO][2023-08-18 21:24:57,348][bigframes.remote_function] Connector bigframes-rf-conn already exists\n", + "[INFO][2023-08-18 21:24:57,351][bigframes.remote_function] Creating BQ remote function: \n", + " CREATE OR REPLACE FUNCTION `bigframes-dev.bigframes_temp_us`.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4(n INT64)\n", + " RETURNS INT64\n", + " REMOTE WITH CONNECTION `bigframes-dev.us.bigframes-rf-conn`\n", + " OPTIONS (\n", + " endpoint = \"https://bigframes-b0feb1fbaf8188b64d7e70118d93c5d4-7krlje3eoq-uc.a.run.app/"\n", + " )\n", + "[INFO][2023-08-18 21:24:58,300][bigframes.remote_function] Created remote function 
bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n" ] }, { @@ -633,7 +673,7 @@ "output_type": "stream", "text": [ "\n", - "Wall time: 76.2659 s\n" + "Wall time: 89.0601 s\n" ] } ], @@ -672,7 +712,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "c1c9355f", "metadata": {}, "outputs": [ @@ -680,10 +720,52 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 22.2 ms, sys: 0 ns, total: 22.2 ms\n", - "Wall time: 22.4 ms\n" + "CPU times: user 16.8 ms, sys: 61 µs, total: 16.8 ms\n", + "Wall time: 17 ms\n" ] }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2f840ad27c514ed19c759a004b32de33", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job 0f421233-9d02-4746-bb39-86a3b0880aba is RUNNING.
\n", " \n", " 0\n", - " 35153602\n", - " Are you trying to access a nested array?\n", + " 11012908\n", + " you're welcome! according to the docs it shoul...\n", " 0\n", " -1\n", " \n", " \n", " 1\n", - " 35156124\n", - " Sorry I didn't include my timeout method befor...\n", + " 11013760\n", + " You *should* be concerned with the disk being ...\n", " 0\n", " -1\n", " \n", " \n", " 2\n", - " 35157401\n", - " As soon as I defined some sort of primary Key ...\n", + " 11013784\n", + " have you looked at `Integrate` or `NIntegrate`?\n", " 0\n", " -1\n", " \n", " \n", " 3\n", - " 35158649\n", - " @user3355243 I've edited it to give $values an...\n", + " 11015512\n", + " sorry, is a typo. The variable name is dist. (...\n", " 0\n", " -1\n", " \n", " \n", " 4\n", - " 35162039\n", - " I pasted my exes @Matt.\n", + " 11016238\n", + " Pfff, I'm having trouble with that formula too...\n", " 0\n", " -1\n", " \n", " \n", " 5\n", - " 35162396\n", - " @Gene - I do have separate fields, but I also ...\n", + " 11016276\n", + " Thanks thinksteep! Does this mean that by usin...\n", " 0\n", " -1\n", " \n", " \n", " 6\n", - " 35162907\n", - " could you please provide any kind of sketch ho...\n", + " 11016551\n", + " Jason, thanks for the reply. I've been workin...\n", " 0\n", " -1\n", " \n", " \n", " 7\n", - " 35166498\n", - " We use PhoneGap. Using Angular.\n", + " 11017973\n", + " I assume an `off` of 0.5 would put be exactly ...\n", " 0\n", " -1\n", " \n", " \n", " 8\n", - " 35170138\n", - " `decltype((int(Foo::*)(int))(&Foo::foo))` That...\n", + " 11018225\n", + " Thank you very much. I do worry too much abou...\n", " 0\n", " -1\n", " \n", " \n", " 9\n", - " 35172348\n", - " Yes, I found that one already. And I understan...\n", + " 11018370\n", + " @IanClelland, I edited my question a bit. The ...\n", " 0\n", " -1\n", " \n", " \n", "\n", + "

10 rows × 4 columns

\n", "[10 rows x 4 columns in total]" ], "text/plain": [ " id text score n_prime\n", - "0 35153602 Are you trying to access a nested array? 0 -1\n", - "1 35156124 Sorry I didn't include my timeout method befor... 0 -1\n", - "2 35157401 As soon as I defined some sort of primary Key ... 0 -1\n", - "3 35158649 @user3355243 I've edited it to give $values an... 0 -1\n", - "4 35162039 I pasted my exes @Matt. 0 -1\n", - "5 35162396 @Gene - I do have separate fields, but I also ... 0 -1\n", - "6 35162907 could you please provide any kind of sketch ho... 0 -1\n", - "7 35166498 We use PhoneGap. Using Angular. 0 -1\n", - "8 35170138 `decltype((int(Foo::*)(int))(&Foo::foo))` That... 0 -1\n", - "9 35172348 Yes, I found that one already. And I understan... 0 -1\n", + "0 11012908 you're welcome! according to the docs it shoul... 0 -1\n", + "1 11013760 You *should* be concerned with the disk being ... 0 -1\n", + "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 -1\n", + "3 11015512 sorry, is a typo. The variable name is dist. (... 0 -1\n", + "4 11016238 Pfff, I'm having trouble with that formula too... 0 -1\n", + "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 -1\n", + "6 11016551 Jason, thanks for the reply. I've been workin... 0 -1\n", + "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 -1\n", + "8 11018225 Thank you very much. I do worry too much abou... 0 -1\n", + "9 11018370 @IanClelland, I edited my question a bit. The ... 
0 -1\n", "\n", "[10 rows x 4 columns]" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -810,9 +893,299 @@ "source": [ "%%time\n", "\n", + "# Let's apply the function to the dataframe\n", "df = df.assign(n_prime=df['score'].apply(nth_prime))\n", "df.head(10)" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2701cb81", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "bigframes-dev.bigframes_temp_us.bigframes_b0feb1fbaf8188b64d7e70118d93c5d4\n", + "projects/bigframes-dev/locations/us-central1/functions/bigframes-b0feb1fbaf8188b64d7e70118d93c5d4\n" + ] + } + ], + "source": [ + "# We can see the path to the BQ remote function and the google cloud function\n", + "# that was created under the hood\n", + "print(nth_prime.bigframes_remote_function)\n", + "print(nth_prime.bigframes_cloud_function)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "920fa18e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on function read_gbq_function in module bigframes.pandas:\n", + "\n", + "read_gbq_function(function_name: 'str')\n", + " Loads a BigQuery function from BigQuery.\n", + " \n", + " Then it can be applied to a DataFrame or Series.\n", + " \n", + " Args:\n", + " function_name (str):\n", + " the function's name in BigQuery in the format\n", + " `project_id.dataset_id.function_name`, or\n", + " `dataset_id.function_name` to load from the default project, or\n", + " `function_name` to load from the default project and the dataset\n", + " associated with the current session.\n", + " \n", + " Returns:\n", + " callable: A function object pointing to the BigQuery function read\n", + " from BigQuery.\n", + " \n", + " The object is similar to the one created by the `remote_function`\n", + " decorator, including the `bigframes_remote_function` property, but\n", + " not including the 
`bigframes_cloud_function` property.\n", + "\n" + ] + } + ], + "source": [ + "# Let's try to simulate a scenario in which user shares this remote function to\n", + "# their colleague who simply wants to reuse it. BigFrames provides an API to do\n", + "# so via `read_gbq_function`. Usage details are available via `help` command.\n", + "help(pd.read_gbq_function)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "a6c9da0a", + "metadata": {}, + "outputs": [], + "source": [ + "EXISTING_REMOTE_FUNCTION=nth_prime.bigframes_remote_function\n", + "\n", + "# Let's read the existing remote function in bigframes\n", + "nth_prime_existing = pd.read_gbq_function(EXISTING_REMOTE_FUNCTION)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d7e7de7f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 10.9 ms, sys: 0 ns, total: 10.9 ms\n", + "Wall time: 11.4 ms\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "73d1a73593cb4115821ab128c221a48d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='Query job bec5f7d1-3df1-4292-8c68-c396bce7dc5d is RUNNING. 
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtextscoren_primen_prime_again
011012908you're welcome! according to the docs it shoul...0-1-1
111013760You *should* be concerned with the disk being ...0-1-1
211013784have you looked at `Integrate` or `NIntegrate`?0-1-1
311015512sorry, is a typo. The variable name is dist. (...0-1-1
411016238Pfff, I'm having trouble with that formula too...0-1-1
511016276Thanks thinksteep! Does this mean that by usin...0-1-1
611016551Jason, thanks for the reply. I've been workin...0-1-1
711017973I assume an `off` of 0.5 would put be exactly ...0-1-1
811018225Thank you very much. I do worry too much abou...0-1-1
911018370@IanClelland, I edited my question a bit. The ...0-1-1
\n", + "

10 rows × 5 columns

\n", + "[10 rows x 5 columns in total]" + ], + "text/plain": [ + " id text score \\\n", + "0 11012908 you're welcome! according to the docs it shoul... 0 \n", + "1 11013760 You *should* be concerned with the disk being ... 0 \n", + "2 11013784 have you looked at `Integrate` or `NIntegrate`? 0 \n", + "3 11015512 sorry, is a typo. The variable name is dist. (... 0 \n", + "4 11016238 Pfff, I'm having trouble with that formula too... 0 \n", + "5 11016276 Thanks thinksteep! Does this mean that by usin... 0 \n", + "6 11016551 Jason, thanks for the reply. I've been workin... 0 \n", + "7 11017973 I assume an `off` of 0.5 would put be exactly ... 0 \n", + "8 11018225 Thank you very much. I do worry too much abou... 0 \n", + "9 11018370 @IanClelland, I edited my question a bit. The ... 0 \n", + "\n", + " n_prime n_prime_again \n", + "0 -1 -1 \n", + "1 -1 -1 \n", + "2 -1 -1 \n", + "3 -1 -1 \n", + "4 -1 -1 \n", + "5 -1 -1 \n", + "6 -1 -1 \n", + "7 -1 -1 \n", + "8 -1 -1 \n", + "9 -1 -1 \n", + "\n", + "[10 rows x 5 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "\n", + "# Let's apply the existing function to the dataframe\n", + "df = df.assign(n_prime_again=df['score'].apply(nth_prime_existing))\n", + "df.head(10)" + ] } ], "metadata": { @@ -831,7 +1204,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/tests/system/conftest.py b/tests/system/conftest.py index bc94e32e12..3153bd1559 100644 --- a/tests/system/conftest.py +++ b/tests/system/conftest.py @@ -849,7 +849,7 @@ def floats_pd(): { "float64_col": [ float("-inf"), - float("-inf"), + float("inf"), float("nan"), float(-234239487.4), float(-1.0), @@ -863,17 +863,33 @@ def floats_pd(): float(math.e), float(math.pi), float(234239487.4), + float(1.23124 * (2**70)), pd.NA, ] }, dtype=pd.Float64Dtype(), ) + # Index helps debug failed 
cases df.index = df.float64_col # Upload fails if index name same as column name df.index.name = None return df.float64_col +@pytest.fixture() +def floats_product_pd(floats_pd): + df = pd.merge(floats_pd, floats_pd, how="cross") + # Index helps debug failed cases + df = df.set_index([df.float64_col_x, df.float64_col_y]) + df.index.names = ["left", "right"] + return df + + @pytest.fixture() def floats_bf(session, floats_pd): return session.read_pandas(floats_pd.to_frame()).float64_col + + +@pytest.fixture() +def floats_product_bf(session, floats_product_pd): + return session.read_pandas(floats_product_pd) diff --git a/tests/system/large/ml/test_core.py b/tests/system/large/ml/test_core.py index ab33e5d718..133af2dae4 100644 --- a/tests/system/large/ml/test_core.py +++ b/tests/system/large/ml/test_core.py @@ -20,7 +20,7 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_df): df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -30,10 +30,10 @@ def test_bqml_e2e(session, dataset_id, penguins_df_default_index, new_penguins_d "sex", ] ] - train_y = df[["body_mass_g"]] + y_train = df[["body_mass_g"]] model = bigframes.ml.core.create_bqml_model( - train_X, train_y, options={"model_type": "linear_reg"} + X_train, y_train, options={"model_type": "linear_reg"} ) # no data - report evaluation from the automatic data split @@ -85,22 +85,22 @@ def test_bqml_manual_preprocessing_e2e( session, dataset_id, penguins_df_default_index, new_penguins_df ): df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "culmen_length_mm", "culmen_depth_mm", "flipper_length_mm", ] ] - train_y = df[["body_mass_g"]] + y_train = df[["body_mass_g"]] transforms = [ bigframes.ml.sql.ml_standard_scaler(column, column) - for column in train_X.columns.tolist() + for column in X_train.columns.tolist() ] - transforms.extend(train_y.columns.tolist()) + transforms.extend(y_train.columns.tolist()) options = 
{"model_type": "linear_reg"} model = bigframes.ml.core.create_bqml_model( - train_X, train_y, transforms=transforms, options=options + X_train, y_train, transforms=transforms, options=options ) # no data - report evaluation from the automatic data split diff --git a/tests/system/large/ml/test_ensemble.py b/tests/system/large/ml/test_ensemble.py index 88c5ccd2f0..9b2872d673 100644 --- a/tests/system/large/ml/test_ensemble.py +++ b/tests/system/large/ml/test_ensemble.py @@ -25,7 +25,7 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBRegressor() df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -35,11 +35,11 @@ def test_xgbregressor_default_params(penguins_df_default_index, dataset_id): "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [97.368139], @@ -86,7 +86,7 @@ def test_xgbregressor_dart_booster_multiple_params( ) df = penguins_df_default_index.dropna().sample(n=70) - train_X = df[ + X_train = df[ [ "species", "island", @@ -96,11 +96,11 @@ def test_xgbregressor_dart_booster_multiple_params( "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -144,7 +144,7 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): model = bigframes.ml.ensemble.XGBClassifier() df = penguins_df_default_index.dropna().sample(n=70) - train_X = 
df[ + X_train = df[ [ "species", "island", @@ -153,11 +153,11 @@ def test_xgbclassifier_default_params(penguins_df_default_index, dataset_id): "flipper_length_mm", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -201,7 +201,7 @@ def test_xgbclassifier_dart_booster_multiple_params( ) df = penguins_df_default_index.dropna().sample(n=70) - train_X = df[ + X_train = df[ [ "species", "island", @@ -210,11 +210,11 @@ def test_xgbclassifier_dart_booster_multiple_params( "flipper_length_mm", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -258,7 +258,7 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset model = bigframes.ml.ensemble.RandomForestRegressor() df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -268,11 +268,11 @@ def test_randomforestregressor_default_params(penguins_df_default_index, dataset "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -311,7 +311,7 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase ) df = 
penguins_df_default_index.dropna().sample(n=70) - train_X = df[ + X_train = df[ [ "species", "island", @@ -321,11 +321,11 @@ def test_randomforestregressor_multiple_params(penguins_df_default_index, datase "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "mean_absolute_error", @@ -366,7 +366,7 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase model = bigframes.ml.ensemble.RandomForestClassifier() df = penguins_df_default_index.dropna().sample(n=70) - train_X = df[ + X_train = df[ [ "species", "island", @@ -375,11 +375,11 @@ def test_randomforestclassifier_default_params(penguins_df_default_index, datase "flipper_length_mm", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -418,7 +418,7 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas ) df = penguins_df_default_index.dropna().sample(n=70) - train_X = df[ + X_train = df[ [ "species", "island", @@ -427,11 +427,11 @@ def test_randomforestclassifier_multiple_params(penguins_df_default_index, datas "flipper_length_mm", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ 
"precision", diff --git a/tests/system/large/ml/test_forecasting.py b/tests/system/large/ml/test_forecasting.py index d1e2d12296..33b835e852 100644 --- a/tests/system/large/ml/test_forecasting.py +++ b/tests/system/large/ml/test_forecasting.py @@ -21,9 +21,9 @@ def test_arima_plus_model_fit_score( time_series_df_default_index, dataset_id, new_time_series_df ): model = forecasting.ARIMAPlus() - train_X = time_series_df_default_index[["parsed_date"]] - train_y = time_series_df_default_index[["total_visits"]] - model.fit(train_X, train_y) + X_train = time_series_df_default_index[["parsed_date"]] + y_train = time_series_df_default_index[["total_visits"]] + model.fit(X_train, y_train) result = model.score( new_time_series_df[["parsed_date"]], new_time_series_df[["total_visits"]] diff --git a/tests/system/large/ml/test_linear_model.py b/tests/system/large/ml/test_linear_model.py index 332b460fe5..3b90568450 100644 --- a/tests/system/large/ml/test_linear_model.py +++ b/tests/system/large/ml/test_linear_model.py @@ -21,7 +21,7 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase model = bigframes.ml.linear_model.LinearRegression(fit_intercept=False) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -31,11 +31,11 @@ def test_linear_regression_configure_fit_score(penguins_df_default_index, datase "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [225.735767], @@ -66,7 +66,7 @@ def test_linear_regression_manual_split_configure_fit_score( model = bigframes.ml.linear_model.LinearRegression(fit_intercept=True) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -76,11 +76,11 
@@ def test_linear_regression_manual_split_configure_fit_score( "sex", ] ] - train_y = df[["body_mass_g"]] - model.fit(train_X, train_y) + y_train = df[["body_mass_g"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { "mean_absolute_error": [225.735767], @@ -108,7 +108,7 @@ def test_logistic_regression_auto_class_weights_configure_fit_score( ): model = bigframes.ml.linear_model.LogisticRegression() df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -117,11 +117,11 @@ def test_logistic_regression_auto_class_weights_configure_fit_score( "flipper_length_mm", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { "precision": [0.58085], @@ -155,7 +155,7 @@ def test_logistic_regression_manual_split_configure_fit_score( model = bigframes.ml.linear_model.LogisticRegression(fit_intercept=True) df = penguins_df_default_index.dropna() - train_X = df[ + X_train = df[ [ "species", "island", @@ -165,11 +165,11 @@ def test_logistic_regression_manual_split_configure_fit_score( "body_mass_g", ] ] - train_y = df[["sex"]] - model.fit(train_X, train_y) + y_train = df[["sex"]] + model.fit(X_train, y_train) # Check score to ensure the model was fitted - result = model.score(train_X, train_y).to_pandas() + result = model.score(X_train, y_train).to_pandas() expected = pd.DataFrame( { "precision": [0.616753], diff --git a/tests/system/small/ml/conftest.py b/tests/system/small/ml/conftest.py index ebe768b685..9ca5a2fd0e 100644 --- a/tests/system/small/ml/conftest.py +++ b/tests/system/small/ml/conftest.py @@ -32,8 +32,8 @@ @pytest.fixture(scope="session") 
-def ml_connection() -> str: - return "bigframes-dev.us.bigframes-ml" +def bq_connection() -> str: + return "bigframes-dev.us.bigframes-rf-conn" @pytest.fixture(scope="session") @@ -198,33 +198,33 @@ def llm_text_df(session, llm_text_pandas_df): @pytest.fixture(scope="session") -def bqml_palm2_text_generator_model(session, ml_connection) -> core.BqmlModel: +def bqml_palm2_text_generator_model(session, bq_connection) -> core.BqmlModel: options = { "remote_service_type": "CLOUD_AI_LARGE_LANGUAGE_MODEL_V1", } return core.create_bqml_remote_model( - session=session, connection_name=ml_connection, options=options + session=session, connection_name=bq_connection, options=options ) @pytest.fixture(scope="session") -def palm2_text_generator_model(session, ml_connection) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=ml_connection) +def palm2_text_generator_model(session, bq_connection) -> llm.PaLM2TextGenerator: + return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) @pytest.fixture(scope="function") def ephemera_palm2_text_generator_model( - session, ml_connection + session, bq_connection ) -> llm.PaLM2TextGenerator: - return llm.PaLM2TextGenerator(session=session, connection_name=ml_connection) + return llm.PaLM2TextGenerator(session=session, connection_name=bq_connection) @pytest.fixture(scope="session") def palm2_embedding_generator_model( - session, ml_connection + session, bq_connection ) -> llm.PaLM2TextEmbeddingGenerator: return llm.PaLM2TextEmbeddingGenerator( - session=session, connection_name=ml_connection + session=session, connection_name=bq_connection ) @@ -247,10 +247,22 @@ def time_series_arima_plus_model( @pytest.fixture(scope="session") -def imported_tensorflow_model(session) -> imported.TensorFlowModel: +def imported_tensorflow_model_path() -> str: + return "gs://cloud-training-demos/txtclass/export/exporter/1549825580/*" + + +@pytest.fixture(scope="session") +def 
imported_onnx_model_path() -> str: + return "gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx" + + +@pytest.fixture(scope="session") +def imported_tensorflow_model( + session, imported_tensorflow_model_path +) -> imported.TensorFlowModel: return imported.TensorFlowModel( session=session, - model_path="gs://cloud-training-demos/txtclass/export/exporter/1549825580/*", + model_path=imported_tensorflow_model_path, ) @@ -263,8 +275,8 @@ def ephemera_imported_tensorflow_model(session) -> imported.TensorFlowModel: @pytest.fixture(scope="session") -def imported_onnx_model(session) -> imported.ONNXModel: +def imported_onnx_model(session, imported_onnx_model_path) -> imported.ONNXModel: return imported.ONNXModel( session=session, - model_path="gs://cloud-samples-data/bigquery/ml/onnx/pipeline_rf.onnx", + model_path=imported_onnx_model_path, ) diff --git a/tests/system/small/ml/test_ensemble.py b/tests/system/small/ml/test_ensemble.py index fde3cc431e..bba083d98d 100644 --- a/tests/system/small/ml/test_ensemble.py +++ b/tests/system/small/ml/test_ensemble.py @@ -25,7 +25,7 @@ def test_xgbregressor_model_score( penguins_xgbregressor_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -35,8 +35,8 @@ def test_xgbregressor_model_score( "body_mass_g", ] ] - test_y = df[["sex"]] - result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = penguins_xgbregressor_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [108.77582], @@ -62,7 +62,7 @@ def test_xgbregressor_model_score_series( penguins_xgbregressor_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -72,8 +72,8 @@ def test_xgbregressor_model_score_series( "body_mass_g", ] ] - test_y = df["sex"] - result = penguins_xgbregressor_model.score(test_X, test_y).to_pandas() + 
y_test = df["sex"] + result = penguins_xgbregressor_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [108.77582], @@ -120,7 +120,7 @@ def test_to_gbq_saved_xgbregressor_model_scores( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -130,8 +130,8 @@ def test_to_gbq_saved_xgbregressor_model_scores( "body_mass_g", ] ] - test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = saved_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [109.016973], @@ -165,7 +165,7 @@ def test_xgbclassifier_model_score( penguins_xgbclassifier_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -175,8 +175,8 @@ def test_xgbclassifier_model_score( "body_mass_g", ] ] - test_y = df[["sex"]] - result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = penguins_xgbclassifier_model.score(X_test, y_test).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -193,7 +193,7 @@ def test_xgbclassifier_model_score_series( penguins_xgbclassifier_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -203,8 +203,8 @@ def test_xgbclassifier_model_score_series( "body_mass_g", ] ] - test_y = df["sex"] - result = penguins_xgbclassifier_model.score(test_X, test_y).to_pandas() + y_test = df["sex"] + result = penguins_xgbclassifier_model.score(X_test, y_test).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -242,7 +242,7 @@ def test_to_gbq_saved_xgbclassifier_model_scores( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + 
X_test = df[ [ "species", "island", @@ -252,8 +252,8 @@ def test_to_gbq_saved_xgbclassifier_model_scores( "body_mass_g", ] ] - test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = saved_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "precision": [1.0], @@ -289,7 +289,7 @@ def test_randomforestregressor_model_score( penguins_randomforest_regressor_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -299,8 +299,8 @@ def test_randomforestregressor_model_score( "body_mass_g", ] ] - test_y = df[["sex"]] - result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = penguins_randomforest_regressor_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [317.031042], @@ -326,7 +326,7 @@ def test_randomforestregressor_model_score_series( penguins_randomforest_regressor_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -336,8 +336,8 @@ def test_randomforestregressor_model_score_series( "body_mass_g", ] ] - test_y = df["sex"] - result = penguins_randomforest_regressor_model.score(test_X, test_y).to_pandas() + y_test = df["sex"] + result = penguins_randomforest_regressor_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [317.031042], @@ -385,7 +385,7 @@ def test_to_gbq_saved_randomforestregressor_model_scores( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -395,8 +395,8 @@ def test_to_gbq_saved_randomforestregressor_model_scores( "body_mass_g", ] ] - test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = saved_model.score(X_test, 
y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [319.239235], @@ -434,7 +434,7 @@ def test_randomforestclassifier_model_score( penguins_randomforest_classifier_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -444,8 +444,8 @@ def test_randomforestclassifier_model_score( "body_mass_g", ] ] - test_y = df[["sex"]] - result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = penguins_randomforest_classifier_model.score(X_test, y_test).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -462,7 +462,7 @@ def test_randomforestclassifier_model_score_series( penguins_randomforest_classifier_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -472,8 +472,8 @@ def test_randomforestclassifier_model_score_series( "body_mass_g", ] ] - test_y = df["sex"] - result = penguins_randomforest_classifier_model.score(test_X, test_y).to_pandas() + y_test = df["sex"] + result = penguins_randomforest_classifier_model.score(X_test, y_test).to_pandas() TestCase().assertSequenceEqual(result.shape, (1, 6)) for col_name in [ "precision", @@ -512,7 +512,7 @@ def test_to_gbq_saved_randomforestclassifier_model_scores( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -522,8 +522,8 @@ def test_to_gbq_saved_randomforestclassifier_model_scores( "body_mass_g", ] ] - test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = saved_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "precision": [0.636746], diff --git a/tests/system/small/ml/test_imported.py b/tests/system/small/ml/test_imported.py index 6274ab1245..d305567066 100644 --- 
a/tests/system/small/ml/test_imported.py +++ b/tests/system/small/ml/test_imported.py @@ -25,6 +25,11 @@ def test_tensorflow_create_model(imported_tensorflow_model): assert imported_tensorflow_model is not None +def test_tensorflow_create_model_default_session(imported_tensorflow_model_path): + model = imported.TensorFlowModel(model_path=imported_tensorflow_model_path) + assert model is not None + + def test_tensorflow_model_predict(imported_tensorflow_model, llm_text_df): df = llm_text_df.rename(columns={"prompt": "input"}) result = imported_tensorflow_model.predict(df).to_pandas() @@ -61,6 +66,11 @@ def test_onnx_create_model(imported_onnx_model): assert imported_onnx_model is not None +def test_onnx_create_model_default_session(imported_onnx_model_path): + model = imported.TensorFlowModel(model_path=imported_onnx_model_path) + assert model is not None + + def test_onnx_model_predict(imported_onnx_model, onnx_iris_df): result = imported_onnx_model.predict(onnx_iris_df).to_pandas() value1 = np.array([0.9999993443489075, 0.0, 0.0]) diff --git a/tests/system/small/ml/test_linear_model.py b/tests/system/small/ml/test_linear_model.py index bbb7e2820c..3a8232ed9e 100644 --- a/tests/system/small/ml/test_linear_model.py +++ b/tests/system/small/ml/test_linear_model.py @@ -19,7 +19,7 @@ def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -29,8 +29,8 @@ def test_linear_reg_model_score(penguins_linear_model, penguins_df_default_index "sex", ] ] - test_y = df[["body_mass_g"]] - result = penguins_linear_model.score(test_X, test_y).to_pandas() + y_test = df[["body_mass_g"]] + result = penguins_linear_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [225.817334], @@ -56,7 +56,7 @@ def test_linear_reg_model_score_series( penguins_linear_model, penguins_df_default_index ): df = 
penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -66,8 +66,8 @@ def test_linear_reg_model_score_series( "sex", ] ] - test_y = df["body_mass_g"] - result = penguins_linear_model.score(test_X, test_y).to_pandas() + y_test = df["body_mass_g"] + result = penguins_linear_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [225.817334], @@ -111,7 +111,7 @@ def test_to_gbq_saved_linear_reg_model_scores( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -121,8 +121,8 @@ def test_to_gbq_saved_linear_reg_model_scores( "sex", ] ] - test_y = df[["body_mass_g"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["body_mass_g"]] + result = saved_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "mean_absolute_error": [227.01223], @@ -152,7 +152,7 @@ def test_to_gbq_replace(penguins_linear_model, dataset_id): def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -162,8 +162,8 @@ def test_logistic_model_score(penguins_logistic_model, penguins_df_default_index "body_mass_g", ] ] - test_y = df[["sex"]] - result = penguins_logistic_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = penguins_logistic_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "precision": [0.616753], @@ -189,7 +189,7 @@ def test_logistic_model_score_series( penguins_logistic_model, penguins_df_default_index ): df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -199,8 +199,8 @@ def test_logistic_model_score_series( "body_mass_g", ] ] - test_y = df["sex"] - result = penguins_logistic_model.score(test_X, test_y).to_pandas() + y_test = df["sex"] + result = 
penguins_logistic_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "precision": [0.616753], @@ -244,7 +244,7 @@ def test_logsitic_model_to_gbq_saved_score( f"{dataset_id}.test_penguins_model", replace=True ) df = penguins_df_default_index.dropna() - test_X = df[ + X_test = df[ [ "species", "island", @@ -254,8 +254,8 @@ def test_logsitic_model_to_gbq_saved_score( "body_mass_g", ] ] - test_y = df[["sex"]] - result = saved_model.score(test_X, test_y).to_pandas() + y_test = df[["sex"]] + result = saved_model.score(X_test, y_test).to_pandas() expected = pandas.DataFrame( { "precision": [0.616753], diff --git a/tests/system/small/ml/test_llm.py b/tests/system/small/ml/test_llm.py index 181678ebcb..7486277487 100644 --- a/tests/system/small/ml/test_llm.py +++ b/tests/system/small/ml/test_llm.py @@ -17,10 +17,25 @@ import numpy as np import pytest +from bigframes.ml import llm + def test_create_text_generator_model(palm2_text_generator_model): # Model creation doesn't return error assert palm2_text_generator_model is not None + assert palm2_text_generator_model._bqml_model is not None + + +def test_create_text_generator_model_defaults(bq_connection): + import bigframes.pandas as bpd + + bpd.reset_session() + bpd.options.bigquery.bq_connection = bq_connection + bpd.options.bigquery.location = "us" + + model = llm.PaLM2TextGenerator() + assert model is not None + assert model._bqml_model is not None # Marked as flaky only because BQML LLM is in preview, the service only has limited capacity, not stable enough. 
@@ -74,6 +89,19 @@ def test_text_generator_predict_with_params_success( def test_create_embedding_generator_model(palm2_embedding_generator_model): # Model creation doesn't return error assert palm2_embedding_generator_model is not None + assert palm2_embedding_generator_model._bqml_model is not None + + +def test_create_text_embedding_generator_model_defaults(bq_connection): + import bigframes.pandas as bpd + + bpd.reset_session() + bpd.options.bigquery.bq_connection = bq_connection + bpd.options.bigquery.location = "us" + + model = llm.PaLM2TextEmbeddingGenerator() + assert model is not None + assert model._bqml_model is not None @pytest.mark.flaky(retries=2, delay=120) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 01305adb20..85c3cce1d7 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -551,13 +551,69 @@ def test_assign_callable_lambda(scalars_dfs): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) -def test_dropna(scalars_dfs): +@pytest.mark.parametrize( + ("axis", "how", "ignore_index"), + [ + (0, "any", False), + (0, "any", True), + (1, "any", False), + (1, "all", False), + ], +) +def test_df_dropna(scalars_dfs, axis, how, ignore_index): + if pd.__version__.startswith("1."): + pytest.skip("ignore_index parameter not supported in pandas 1.x.") scalars_df, scalars_pandas_df = scalars_dfs - df = scalars_df.dropna() + df = scalars_df.dropna(axis=axis, how=how, ignore_index=ignore_index) bf_result = df.to_pandas() - pd_result = scalars_pandas_df.dropna() + pd_result = scalars_pandas_df.dropna(axis=axis, how=how, ignore_index=ignore_index) - assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + # Pandas uses int64 instead of Int64 (nullable) dtype. 
+ pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_fillna(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + df = scalars_df[["int64_col", "float64_col"]].fillna(3) + bf_result = df.to_pandas() + pd_result = scalars_pandas_df[["int64_col", "float64_col"]].fillna(3) + + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_df_isin_list(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, pd.NA, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + +def test_df_isin_dict(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = { + "string_col": ["Hello, World!", 55555, 2.51, pd.NA, True], + "int64_col": [5555, 2.51], + "bool_col": [pd.NA], + } + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) @pytest.mark.parametrize( @@ -1084,50 +1140,43 @@ def test_series_binop_axis_index( @pytest.mark.parametrize( - ("op"), + ("left_labels", "right_labels"), [ - (lambda x, y: x.add(y, axis="index")), - (lambda x, y: x.radd(y, axis="index")), - (lambda x, y: x.sub(y, axis="index")), - (lambda x, y: x.rsub(y, axis="index")), - (lambda x, y: x.mul(y, axis="index")), - (lambda x, y: x.rmul(y, axis="index")), - (lambda x, y: x.truediv(y, axis="index")), - (lambda x, y: x.rtruediv(y, axis="index")), - (lambda x, y: x.floordiv(y, axis="index")), - (lambda x, y: x.floordiv(y, axis="index")), - (lambda x, y: 
x.gt(y, axis="index")), - (lambda x, y: x.ge(y, axis="index")), - (lambda x, y: x.lt(y, axis="index")), - (lambda x, y: x.le(y, axis="index")), + (["a", "a", "b"], ["c", "c", "d"]), + (["a", "b", "c"], ["c", "a", "b"]), + (["a", "c", "c"], ["c", "a", "c"]), ], ids=[ - "add", - "radd", - "sub", - "rsub", - "mul", - "rmul", - "truediv", - "rtruediv", - "floordiv", - "rfloordiv", - "gt", - "ge", - "lt", - "le", + "no_overlap", + "one_one_match", + "multi_match", ], ) -def test_dataframe_binop_axis_index_throws_not_implemented( - scalars_dfs, - op, +def test_binop_df_df_binary_op( + scalars_df_index, + scalars_df_2_index, + scalars_pandas_df_index, + left_labels, + right_labels, ): - scalars_df, scalars_pandas_df = scalars_dfs - df_columns = ["int64_col", "float64_col"] - other_df_columns = ["int64_too"] - - with pytest.raises(NotImplementedError): - op(scalars_df[df_columns], scalars_df[other_df_columns]).to_pandas() + if pd.__version__.startswith("1."): + pytest.skip("pd.NA vs NaN not handled well in pandas 1.x.") + columns = ["int64_too", "int64_col", "float64_col"] + + bf_df_a = scalars_df_index[columns] + bf_df_a.columns = left_labels + bf_df_b = scalars_df_2_index[columns] + bf_df_b.columns = right_labels + bf_result = (bf_df_a - bf_df_b).to_pandas() + + pd_df_a = scalars_pandas_df_index[columns] + pd_df_a.columns = left_labels + pd_df_b = scalars_pandas_df_index[columns] + pd_df_b.columns = right_labels + pd_result = pd_df_a - pd_df_b + + # Some dtype inconsistency for all-NULL columns + pd.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) # Differnt table will only work for explicit index, since default index orders are arbitrary. 
diff --git a/tests/system/small/test_index.py b/tests/system/small/test_index.py index ac1f8c7220..558dd12e69 100644 --- a/tests/system/small/test_index.py +++ b/tests/system/small/test_index.py @@ -50,3 +50,17 @@ def test_index_getitem_int(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.index[-2] pd_result = scalars_pandas_df_index.index[-2] assert bf_result == pd_result + + +def test_is_monotonic_increasing(scalars_df_index, scalars_pandas_df_index): + assert ( + scalars_df_index.index.is_monotonic_increasing + == scalars_pandas_df_index.index.is_monotonic_increasing + ) + + +def test_is_monotonic_decreasing(scalars_df_index, scalars_pandas_df_index): + assert ( + scalars_df_index.index.is_monotonic_decreasing + == scalars_pandas_df_index.index.is_monotonic_decreasing + ) diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 1baf3e6650..25d1e2ad49 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -634,12 +634,17 @@ def test_column_multi_index_stack(scalars_df_index, scalars_pandas_df_index): pd_df.columns = multi_columns bf_result = bf_df.stack().to_pandas() + # Shifting sort behavior in stack pd_result = pd_df.stack() # Pandas produces NaN, where bq dataframes produces pd.NA - pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + # Column ordering seems to depend on pandas version + pandas.testing.assert_frame_equal( + bf_result.sort_index(axis=1), pd_result.sort_index(axis=1), check_dtype=False + ) +@pytest.mark.skip(reason="Pandas fails in newer versions.") def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index): columns = ["int64_too", "int64_col", "rowindex_2"] level1 = pandas.Index(["b", pandas.NA, pandas.NA]) @@ -656,3 +661,64 @@ def test_column_multi_index_w_na_stack(scalars_df_index, scalars_pandas_df_index # Pandas produces NaN, where bq dataframes produces pd.NA
pandas.testing.assert_frame_equal(bf_result, pd_result, check_dtype=False) + + +@pytest.mark.parametrize( + ("index_names",), + [ + (["rowindex_2", "int64_too"],), + (["int64_too", "rowindex_2"],), + ], +) +def test_is_monotonic_increasing( + scalars_df_index, scalars_pandas_df_index, index_names +): + bf_result = scalars_df_index.set_index(index_names).index + pd_result = scalars_pandas_df_index.set_index(index_names).index + + assert bf_result.is_monotonic_increasing == pd_result.is_monotonic_increasing + + +@pytest.mark.parametrize( + ("indexes",), + [ + ({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3]},), + ({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, None, 3]},), + ({"A": [1, 2, 2], "B": [1, 2, 1], "C": [1, 2, 3]},), + ({"A": [1, 2, 2], "B": [1, 2, 3], "C": [1, 2, 1]},), + ({"A": [1, 2, 1], "B": [1, 2, 3], "C": [1, 2, 1]},), + ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [2, 2, 1]},), + ], +) +def test_is_monotonic_increasing_extra(indexes): + bf_result = bpd.DataFrame(indexes) + bf_result = bf_result.set_index(["A", "B", "C"]) + pd_result = pandas.DataFrame(indexes) + pd_result = pd_result.set_index(["A", "B", "C"]) + + assert ( + bf_result.index.is_monotonic_increasing + == pd_result.index.is_monotonic_increasing + ) + + +@pytest.mark.parametrize( + ("indexes",), + [ + ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [3, 2, 1]},), + ({"A": [3, 2, 1], "B": [3, 2, 1], "C": [3, None, 1]},), + ({"A": [2, 2, 1], "B": [1, 2, 1], "C": [3, 2, 1]},), + ({"A": [2, 2, 1], "B": [3, 2, 1], "C": [1, 2, 1]},), + ({"A": [1, 2, 1], "B": [3, 2, 1], "C": [1, 2, 1]},), + ], +) +def test_is_monotonic_decreasing_extra(indexes): + bf_result = bpd.DataFrame(indexes) + bf_result = bf_result.set_index(["A", "B", "C"]) + pd_result = pandas.DataFrame(indexes) + pd_result = pd_result.set_index(["A", "B", "C"]) + + assert ( + bf_result.index.is_monotonic_decreasing + == pd_result.index.is_monotonic_decreasing + ) diff --git a/tests/system/small/test_numpy.py b/tests/system/small/test_numpy.py index 
fff689caba..5c2a93ec39 100644 --- a/tests/system/small/test_numpy.py +++ b/tests/system/small/test_numpy.py @@ -67,3 +67,69 @@ def test_df_ufuncs(scalars_dfs, opname): pd_result = getattr(np, opname)(scalars_pandas_df[["float64_col", "int64_col"]]) pd.testing.assert_frame_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("opname",), + [ + ("add",), + ("subtract",), + ("multiply",), + ("divide",), + ("power",), + ], +) +def test_series_binary_ufuncs(floats_product_pd, floats_product_bf, opname): + bf_result = getattr(np, opname)( + floats_product_bf.float64_col_x, floats_product_bf.float64_col_y + ).to_pandas() + pd_result = getattr(np, opname)( + floats_product_pd.float64_col_x, floats_product_pd.float64_col_y + ) + pd.testing.assert_series_equal(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("opname",), + [ + ("add",), + ("subtract",), + ("multiply",), + ("divide",), + ("power",), + ], +) +def test_df_binary_ufuncs(scalars_dfs, opname): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = getattr(np, opname)( + scalars_df[["float64_col", "int64_col"]], 5.1 + ).to_pandas() + pd_result = getattr(np, opname)( + scalars_pandas_df[["float64_col", "int64_col"]], 5.1 + ) + + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_binary_ufuncs_reverse(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Could be any non-symmetric binary op + bf_result = np.subtract(5.1, scalars_df["int64_col"]).to_pandas() + pd_result = np.subtract(5.1, scalars_pandas_df["int64_col"]) + + pd.testing.assert_series_equal(bf_result, pd_result) + + +def test_df_binary_ufuncs_reverse(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + # Could be any non-symmetric binary op + bf_result = np.subtract(5.1, scalars_df[["float64_col", "int64_col"]]).to_pandas() + pd_result = np.subtract( + 5.1, + scalars_pandas_df[["float64_col", "int64_col"]], + ) + + pd.testing.assert_frame_equal(bf_result, pd_result) diff --git 
a/tests/system/small/test_pandas.py b/tests/system/small/test_pandas.py index 98bafc6392..e451d5c3a2 100644 --- a/tests/system/small/test_pandas.py +++ b/tests/system/small/test_pandas.py @@ -16,6 +16,7 @@ import pytest import bigframes.pandas as bpd +from tests.system.utils import assert_pandas_df_equal_ignore_ordering def test_concat_dataframe(scalars_dfs): @@ -105,3 +106,106 @@ def test_concat_axis_1(scalars_dfs, how): pd_result = pd.concat([pd_part1, pd_part2, pd_part3], join=how, axis=1) pd.testing.assert_frame_equal(bf_result.to_pandas(), pd_result) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + on = "rowindex_2" + left_columns = ["int64_col", "float64_col", "rowindex_2"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + # Offset the rows somewhat so that outer join can have an effect. 
+ right = scalars_df[right_columns].assign(rowindex_2=scalars_df["rowindex_2"] + 2) + + df = bpd.merge(left, right, merge_how, on, sort=True) + bf_result = df.to_pandas() + + pd_result = pd.merge( + scalars_pandas_df[left_columns], + scalars_pandas_df[right_columns].assign( + rowindex_2=scalars_pandas_df["rowindex_2"] + 2 + ), + merge_how, + on, + sort=True, + ) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_left_on_right_on(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_columns = ["int64_col", "float64_col", "int64_too"] + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_columns] + right = scalars_df[right_columns] + + df = bpd.merge( + left, right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = pd.merge( + scalars_pandas_df[left_columns], + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) + + +@pytest.mark.parametrize( + ("merge_how",), + [ + ("inner",), + ("outer",), + ("left",), + ("right",), + ], +) +def test_merge_series(scalars_dfs, merge_how): + scalars_df, scalars_pandas_df = scalars_dfs + left_column = "int64_too" + right_columns = ["int64_col", "bool_col", "string_col", "rowindex_2"] + + left = scalars_df[left_column] + right = scalars_df[right_columns] + + df = bpd.merge( + left, right, merge_how, left_on="int64_too", right_on="rowindex_2", sort=True + ) + bf_result = df.to_pandas() + + pd_result = pd.merge( + scalars_pandas_df[left_column], + scalars_pandas_df[right_columns], + merge_how, + left_on="int64_too", + right_on="rowindex_2", + sort=True, + ) + + assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) diff --git 
a/tests/system/small/test_pandas_options.py b/tests/system/small/test_pandas_options.py index 9a3d55aed2..6510c4fa27 100644 --- a/tests/system/small/test_pandas_options.py +++ b/tests/system/small/test_pandas_options.py @@ -254,49 +254,6 @@ def test_read_gbq_must_comply_with_set_location_non_US( assert df is not None -def test_reset_session_after_bq_session_ended(): - # Use a simple test query to verify that default session works to interact - # with BQ - test_query = "SELECT 1" - - # Confirm that there is a session id in the default session - session = bpd.get_global_session() - assert session._session_id - - # Confirm that session works as usual - df = bpd.read_gbq(test_query) - assert df is not None - - # Abort the session to simulate the auto-expiration - # https://cloud.google.com/bigquery/docs/sessions-terminating#auto-terminate_a_session - abort_session_query = "CALL BQ.ABORT_SESSION()" - query_job = session.bqclient.query(abort_session_query) - query_job.result() # blocks until finished - - # Confirm that session is unusable to run any jobs - with pytest.raises( - google.api_core.exceptions.BadRequest, - match=f"Session {session._session_id} has expired and is no longer available.", - ): - query_job = session.bqclient.query(test_query) - query_job.result() # blocks until finished - - # Confirm that as a result bigframes.pandas interface is unusable - with pytest.raises( - google.api_core.exceptions.BadRequest, - match=f"Session {session._session_id} has expired and is no longer available.", - ): - bpd.read_gbq(test_query) - - # Now try to reset session and verify that it works - bpd.reset_session() - assert bigframes.core.global_session._global_session is None - - # Now verify that use is able to start over - df = bpd.read_gbq(test_query) - assert df is not None - - def test_reset_session_after_credentials_need_reauthentication(monkeypatch): # Use a simple test query to verify that default session works to interact # with BQ diff --git 
a/tests/system/small/test_remote_function.py b/tests/system/small/test_remote_function.py index c60d270fca..77fb81d2c9 100644 --- a/tests/system/small/test_remote_function.py +++ b/tests/system/small/test_remote_function.py @@ -65,9 +65,7 @@ def bq_cf_connection_location_project_mismatched() -> str: @pytest.fixture(scope="module") def session_with_bq_connection(bq_cf_connection) -> bigframes.Session: - return bigframes.Session( - bigframes.BigQueryOptions(remote_udf_connection=bq_cf_connection) - ) + return bigframes.Session(bigframes.BigQueryOptions(bq_connection=bq_cf_connection)) @pytest.fixture(scope="module") @@ -75,7 +73,7 @@ def session_with_bq_connection_location_specified( bq_cf_connection_location, ) -> bigframes.Session: return bigframes.Session( - bigframes.BigQueryOptions(remote_udf_connection=bq_cf_connection_location) + bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location) ) @@ -84,9 +82,7 @@ def session_with_bq_connection_location_mistached( bq_cf_connection_location_mistached, ) -> bigframes.Session: return bigframes.Session( - bigframes.BigQueryOptions( - remote_udf_connection=bq_cf_connection_location_mistached - ) + bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_mistached) ) @@ -95,9 +91,7 @@ def session_with_bq_connection_location_project_specified( bq_cf_connection_location_project, ) -> bigframes.Session: return bigframes.Session( - bigframes.BigQueryOptions( - remote_udf_connection=bq_cf_connection_location_project - ) + bigframes.BigQueryOptions(bq_connection=bq_cf_connection_location_project) ) @@ -432,7 +426,7 @@ def test_remote_function_via_session_context_connection_setter( # Creating a session scoped only to this test as we would be setting a # property in it context = bigframes.BigQueryOptions() - context.remote_udf_connection = bq_cf_connection + context.bq_connection = bq_cf_connection session = bigframes.connect(context) # Without an explicit bigquery connection, the one present in Session, @@ -523,6 
+517,25 @@ def add_one(x): assert_pandas_df_equal_ignore_ordering(bf_result, pd_result) +@pytest.mark.flaky(retries=2, delay=120) +def test_series_map(session_with_bq_connection, scalars_dfs): + def add_one(x): + return x + 1 + + remote_add_one = session_with_bq_connection.remote_function([int], int)(add_one) + + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = scalars_df.int64_too.map(remote_add_one).to_pandas() + pd_result = scalars_pandas_df.int64_too.map(add_one) + pd_result = pd_result.astype("Int64") # pandas type differences + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.flaky(retries=2, delay=120) def test_read_gbq_function_detects_invalid_function(bigquery_client, dataset_id): dataset_ref = bigquery.DatasetReference.from_string(dataset_id) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 88ad2245c9..07dc892ddc 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -186,6 +186,23 @@ def test_fillna(scalars_dfs): ) +@pytest.mark.parametrize( + ("ignore_index",), + ( + (True,), + (False,), + ), +) +def test_series_dropna(scalars_dfs, ignore_index): + if pd.__version__.startswith("1."): + pytest.skip("ignore_index parameter not supported in pandas 1.x.") + scalars_df, scalars_pandas_df = scalars_dfs + col_name = "string_col" + bf_result = scalars_df[col_name].dropna(ignore_index=ignore_index).to_pandas() + pd_result = scalars_pandas_df[col_name].dropna(ignore_index=ignore_index) + pd.testing.assert_series_equal(pd_result, bf_result, check_index_type=False) + + def test_series_agg_single_string(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs bf_result = scalars_df["int64_col"].agg("sum") @@ -365,6 +382,24 @@ def test_series_int_int_operators_scalar( assert_series_equal_ignoring_order(pd_result, bf_result) +def test_series_pow_scalar(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (scalars_df["int64_col"] 
** 2).to_pandas() + pd_result = scalars_pandas_df["int64_col"] ** 2 + + assert_series_equal_ignoring_order(pd_result, bf_result) + + +def test_series_pow_scalar_reverse(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + bf_result = (0.8 ** scalars_df["int64_col"]).to_pandas() + pd_result = 0.8 ** scalars_pandas_df["int64_col"] + + assert_series_equal_ignoring_order(pd_result, bf_result) + + @pytest.mark.parametrize( ("operator"), [ @@ -2463,3 +2498,57 @@ def test_is_monotonic_decreasing(series_input): assert ( scalars_df.is_monotonic_decreasing == scalars_pandas_df.is_monotonic_decreasing ) + + +def test_map_dict_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + local_map = dict() + # construct a local map, incomplete to cover behavior + for s in scalars_pandas_df.string_col[:-3]: + if isinstance(s, str): + local_map[s] = ord(s[0]) + + pd_result = scalars_pandas_df.string_col.map(local_map) + pd_result = pd_result.astype("Int64") # pandas type differences + bf_result = scalars_df.string_col.map(local_map) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too.drop_duplicates() + pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, session=scalars_df._get_block().expr._session + ) + + pd_result = scalars_pandas_df.int64_too.map(pd_map_series) + bf_result = scalars_df.int64_too.map(bf_map_series) + + pd.testing.assert_series_equal( + bf_result.to_pandas(), + pd_result, + ) + + +def test_map_series_input_duplicates_error(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + + new_index = scalars_pandas_df.int64_too + pd_map_series = scalars_pandas_df.string_col.iloc[0 : len(new_index)] + pd_map_series.index = new_index + bf_map_series = series.Series( + pd_map_series, 
session=scalars_df._get_block().expr._session + ) + + with pytest.raises(pd.errors.InvalidIndexError): + scalars_pandas_df.int64_too.map(pd_map_series) + with pytest.raises(pd.errors.InvalidIndexError): + scalars_df.int64_too.map(bf_map_series, verify_integrity=True) diff --git a/tests/unit/_config/test_bigquery_options.py b/tests/unit/_config/test_bigquery_options.py index 43b5663bf7..aeee058319 100644 --- a/tests/unit/_config/test_bigquery_options.py +++ b/tests/unit/_config/test_bigquery_options.py @@ -26,7 +26,7 @@ ("credentials", object(), object()), ("location", "us-east1", "us-central1"), ("project", "my-project", "my-other-project"), - ("remote_udf_connection", "path/to/connection/1", "path/to/connection/2"), + ("bq_connection", "path/to/connection/1", "path/to/connection/2"), ], ) def test_setter_raises_if_session_started(attribute, original_value, new_value): @@ -56,7 +56,7 @@ def test_setter_raises_if_session_started(attribute, original_value, new_value): "credentials", "location", "project", - "remote_udf_connection", + "bq_connection", ] ], ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py deleted file mode 100644 index dcf2d918a5..0000000000 --- a/tests/unit/conftest.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2023 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import math -from typing import Callable, Optional, Tuple, Union -from unittest import mock - -import google.api_core.exceptions -import google.auth -import google.cloud.bigquery as bigquery -import google.cloud.bigquery.table -import google.oauth2.credentials # type: ignore -import ibis.expr.types as ibis_types -import pandas -import pytest - -import bigframes -import bigframes.core -import bigframes.dataframe - -SCALARS_TABLE_ID = "project.dataset.scalars_table" - - -@pytest.fixture -def scalars_pandas_df_default_index() -> pandas.DataFrame: - # Note: as of 2023-02-07, using nullable dtypes with the ibis pandas - # backend requires running ibis at HEAD. See: - # https://github.com/ibis-project/ibis/pull/5345 - return pandas.DataFrame( - { - "rowindex": pandas.Series( - [ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - 9, - ], - dtype="Int64", - ), - "bool_col": pandas.Series( - [ - True, - None, - False, - True, - None, - False, - True, - None, - False, - True, - ], - dtype="boolean", - ), - "int64_col": pandas.Series( - [ - 1, - 2, - 3, - None, - 0, - -1, - -2, - 2**63 - 1, - -(2**63), - None, - ], - dtype="Int64", - ), - "float64_col": pandas.Series( - [ - None, - 1, - math.pi, - math.e * 1e10, - 0, - float("nan"), - float("inf"), - float("-inf"), - -2.23e-308, - 1.8e308, - ], - dtype="Float64", - ), - "string_col": pandas.Series( - [ - "abc", - "XYZ", - "aBcDeFgHiJkLmNoPqRsTuVwXyZ", - "1_2-3+4=5~6*7/8&9%10#11@12$" "", - None, - "こんにちは", - "你好", - "வணக்கம்", - "שלום", - ], - dtype="string[pyarrow]", - ), - } - ) - - -# We parameterize the fixtures at this point with the real pandas -# dataframes and deferred bigframes dataframes as we have the following -# chain of dependencies: -# -> index/default_index parameterization -# -> pandas dataframe -# -> bqclient mock -# -> session -# -> bigframes dataframe -@pytest.fixture -def scalars_testdata_setup( - scalars_pandas_df_default_index, -) -> Tuple[ - pandas.DataFrame, Callable[[bigframes.Session], 
bigframes.dataframe.DataFrame] -]: - return ( - scalars_pandas_df_default_index.set_index("rowindex"), - lambda session: session.read_gbq(SCALARS_TABLE_ID, index_col=["rowindex"]), - ) - - -@pytest.fixture(autouse=True) -def mock_bigquery_client(monkeypatch, scalars_testdata_setup) -> bigquery.Client: - scalars_pandas_df, _ = scalars_testdata_setup - mock_client = mock.create_autospec(bigquery.Client) - # Constructor returns the mock itself, so this mock can be treated as the - # constructor or the instance. - mock_client.return_value = mock_client - mock_client.project = "default-project" - most_recent_table = None - - def mock_bigquery_client_get_table( - table_ref: Union[google.cloud.bigquery.table.TableReference, str] - ): - global most_recent_table - - if isinstance(table_ref, google.cloud.bigquery.table.TableReference): - table_name = table_ref.__str__() - else: - table_name = table_ref - - schema = [ - {"mode": "NULLABLE", "name": "rowindex", "type": "INTEGER"}, - { - "mode": "NULLABLE", - "name": "bigframes_ordering_id", - "type": "INTEGER", - }, - ] - - if table_name == SCALARS_TABLE_ID: - schema += [ - {"mode": "NULLABLE", "name": "bool_col", "type": "BOOL"}, - {"mode": "NULLABLE", "name": "int64_col", "type": "INTEGER"}, - {"mode": "NULLABLE", "name": "float64_col", "type": "FLOAT"}, - {"mode": "NULLABLE", "name": "string_col", "type": "STRING"}, - ] - else: - raise google.api_core.exceptions.NotFound("Not Found Table") - - most_recent_table = bigquery.Table(table_name, schema) # type: ignore - return most_recent_table # type: ignore - - def mock_query( - sql: str, - job_config: Optional[bigquery.QueryJobConfig] = None, - location: str = "US", - ) -> bigquery.QueryJob: - global most_recent_table - - def mock_result(max_results=None): - mock_rows = mock.create_autospec(google.cloud.bigquery.table.RowIterator) - mock_rows.total_rows = len(scalars_pandas_df.index) - mock_rows.schema = [ - bigquery.SchemaField(name=name, field_type="INT64") - for name in 
scalars_pandas_df.columns - ] - # Use scalars_pandas_df instead of ibis_expr.execute() to preserve dtypes. - mock_rows.to_dataframe.return_value = scalars_pandas_df.head(n=max_results) - return mock_rows - - mock_job = mock.create_autospec(bigquery.QueryJob) - mock_job.result = mock_result - return mock_job - - mock_client.get_table = mock_bigquery_client_get_table - mock_client.query.side_effect = mock_query - monkeypatch.setattr(bigquery, "Client", mock_client) - mock_client.reset_mock() - return mock_client - - -@pytest.fixture -def session() -> bigframes.Session: - return bigframes.Session( - context=bigframes.BigQueryOptions( - credentials=mock.create_autospec(google.oauth2.credentials.Credentials), - project="unit-test-project", - ) - ) - - -@pytest.fixture -def scalars_ibis_table(session) -> ibis_types.Table: - return session.ibis_client.table(SCALARS_TABLE_ID) diff --git a/tests/unit/core/test_utils.py b/tests/unit/core/test_bf_utils.py similarity index 100% rename from tests/unit/core/test_utils.py rename to tests/unit/core/test_bf_utils.py diff --git a/tests/unit/resources.py b/tests/unit/resources.py new file mode 100644 index 0000000000..c8ed6e86ed --- /dev/null +++ b/tests/unit/resources.py @@ -0,0 +1,73 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Optional +import unittest.mock as mock + +import google.auth.credentials +import google.cloud.bigquery +import ibis +import pandas + +import bigframes +import bigframes.core as core + +"""Utilities for creating test resources.""" + + +def create_bigquery_session( + bqclient: Optional[google.cloud.bigquery.Client] = None, session_id: str = "abcxyz" +) -> bigframes.Session: + credentials = mock.create_autospec( + google.auth.credentials.Credentials, instance=True + ) + + if bqclient is None: + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + + clients_provider = mock.create_autospec(bigframes.session.ClientsProvider) + type(clients_provider).bqclient = mock.PropertyMock(return_value=bqclient) + clients_provider._credentials = credentials + + bqoptions = bigframes.BigQueryOptions( + credentials=credentials, location="test-region" + ) + session = bigframes.Session(context=bqoptions, clients_provider=clients_provider) + session._session_id = session_id + return session + + +def create_pandas_session(tables: Dict[str, pandas.DataFrame]) -> bigframes.Session: + # TODO(tswast): Refactor to make helper available for all tests. Consider + # providing a proper "local Session" for use by downstream developers. 
+ session = mock.create_autospec(bigframes.Session, instance=True) + ibis_client = ibis.pandas.connect(tables) + type(session).ibis_client = mock.PropertyMock(return_value=ibis_client) + return session + + +def create_arrayvalue( + df: pandas.DataFrame, total_ordering_columns: List[str] +) -> bigframes.core.ArrayValue: + session = create_pandas_session({"test_table": df}) + ibis_table = session.ibis_client.table("test_table") + columns = tuple(ibis_table[key] for key in ibis_table.columns) + ordering = core.ExpressionOrdering( + [core.OrderingColumnReference(column) for column in total_ordering_columns], + total_ordering_columns=frozenset(total_ordering_columns), + ) + return core.ArrayValue( + session=session, table=ibis_table, columns=columns, ordering=ordering + ) diff --git a/tests/unit/test_core.py b/tests/unit/test_core.py index 123dae7939..e01638e22e 100644 --- a/tests/unit/test_core.py +++ b/tests/unit/test_core.py @@ -13,39 +13,55 @@ # limitations under the License. import ibis -from ibis.expr.types import Table +import pandas -from bigframes import core +import bigframes.core as core -ORDERING = core.ExpressionOrdering( - [ - core.OrderingColumnReference("int64_col"), - core.OrderingColumnReference("string_col"), - ], - total_ordering_columns=frozenset(["int64_col", "string_col"]), -) +from . 
import resources -def test_constructor_from_ibis_table_adds_all_columns( - session, scalars_ibis_table: Table -): - columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns) +def test_arrayvalue_constructor_from_ibis_table_adds_all_columns(): + session = resources.create_pandas_session( + { + "test_table": pandas.DataFrame( + { + "col1": [1, 2, 3], + "not_included": [True, False, True], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ) + } + ) + ibis_table = session.ibis_client.table("test_table") + columns = (ibis_table["col1"], ibis_table["col2"], ibis_table["col3"]) + ordering = core.ExpressionOrdering( + [core.OrderingColumnReference("col1")], + total_ordering_columns=frozenset(["col1"]), + ) actual = core.ArrayValue( - session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING + session=session, table=ibis_table, columns=columns, ordering=ordering ) - assert actual._table is scalars_ibis_table - assert len(actual._columns) == len(scalars_ibis_table.columns) + assert actual.table is ibis_table + assert len(actual.columns) == 3 -def test_to_ibis_expr_with_projection(session, scalars_ibis_table: Table): - columns = tuple(scalars_ibis_table[key] for key in scalars_ibis_table.columns) - expr = core.ArrayValue( - session=session, table=scalars_ibis_table, columns=columns, ordering=ORDERING - ).projection( +def test_arrayvalue_to_ibis_expr_with_projection(): + value = resources.create_arrayvalue( + pandas.DataFrame( + { + "col1": [1, 2, 3], + "col2": ["a", "b", "c"], + "col3": [0.1, 0.2, 0.3], + } + ), + total_ordering_columns=["col1"], + ) + expr = value.projection( [ - scalars_ibis_table["int64_col"], + (value.table["col1"] + ibis.literal(-1)).name("int64_col"), ibis.literal(123456789).name("literals"), - scalars_ibis_table["string_col"], + value.table["col2"].name("string_col"), ] ) actual = expr.to_ibis_expr() diff --git a/tests/unit/test_pandas.py b/tests/unit/test_pandas.py index a178a45438..2325fc96a0 100644 
--- a/tests/unit/test_pandas.py +++ b/tests/unit/test_pandas.py @@ -17,12 +17,17 @@ import sys import unittest.mock as mock +import google.api_core.exceptions +import google.cloud.bigquery import pandas as pd import pytest +import bigframes.core.global_session import bigframes.pandas as bpd import bigframes.session +from . import resources + leading_whitespace = re.compile(r"^\s+", flags=re.MULTILINE) @@ -109,3 +114,37 @@ def test_pandas_attribute(): assert bpd.Int64Dtype is pd.Int64Dtype assert bpd.StringDtype is pd.StringDtype assert bpd.ArrowDtype is pd.ArrowDtype + + +def test_reset_session_after_bq_session_ended(monkeypatch): + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + session = resources.create_bigquery_session( + bqclient=bqclient, session_id="JUST_A_TEST" + ) + + # Simulate that the session has already expired. + # Note: this needs to be done after the Session is constructed, as the + # initializer sends a query to start the BigQuery Session. + query_job = mock.create_autospec(google.cloud.bigquery.QueryJob, instance=True) + query_job.result.side_effect = google.api_core.exceptions.BadRequest( + "Session JUST_A_TEST has expired and is no longer available." + ) + bqclient.query.return_value = query_job + + # Simulate that the session has already started. + monkeypatch.setattr(bigframes.core.global_session, "_global_session", session) + bpd.options.bigquery._session_started = True + + # Confirm that as a result bigframes.pandas interface is unusable + with pytest.raises( + google.api_core.exceptions.BadRequest, + match="Session JUST_A_TEST has expired and is no longer available.", + ): + bpd.read_gbq("SELECT 1") + + # Even though the query to stop the session raises an exception, we should + # still be able to reset it without raising an error to the user. 
+ bpd.reset_session() + assert "CALL BQ.ABORT_SESSION('JUST_A_TEST')" in bqclient.query.call_args.args[0] + assert bigframes.core.global_session._global_session is None diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py index ab573c4c11..e39a316e5b 100644 --- a/tests/unit/test_session.py +++ b/tests/unit/test_session.py @@ -20,9 +20,13 @@ import bigframes +from . import resources + @pytest.mark.parametrize("missing_parts_table_id", [(""), ("table")]) -def test_read_gbq_missing_parts(session, missing_parts_table_id): +def test_read_gbq_missing_parts(missing_parts_table_id): + session = resources.create_bigquery_session() + with pytest.raises(ValueError): session.read_gbq(missing_parts_table_id) @@ -31,7 +35,14 @@ def test_read_gbq_missing_parts(session, missing_parts_table_id): "not_found_table_id", [("unknown.dataset.table"), ("project.unknown.table"), ("project.dataset.unknown")], ) -def test_read_gdb_not_found_tables(session, not_found_table_id): +def test_read_gdb_not_found_tables(not_found_table_id): + bqclient = mock.create_autospec(google.cloud.bigquery.Client, instance=True) + bqclient.project = "test-project" + bqclient.get_table.side_effect = google.api_core.exceptions.NotFound( + "table not found" + ) + session = resources.create_bigquery_session(bqclient=bqclient) + with pytest.raises(google.api_core.exceptions.NotFound): session.read_gbq(not_found_table_id) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 5a812dae7e..8c81b23b6c 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -380,11 +380,44 @@ def dropna( ) -> DataFrame: """Remove missing values. + Args: + axis ({0 or 'index', 1 or 'columns'}, default 'columns'): + Determine if rows or columns which contain missing values are + removed. + + * 0, or 'index' : Drop rows which contain missing values. 
+ * 1, or 'columns' : Drop columns which contain missing values. + how ({'any', 'all'}, default 'any'): + Determine if row or column is removed from DataFrame, when we have + at least one NA or all NA. + + * 'any' : If any NA values are present, drop that row or column. + * 'all' : If all values are NA, drop that row or column. + ignore_index (bool, default ``False``): + If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns: bigframes.dataframe.DataFrame: DataFrame with NA entries dropped from it. """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def isin(self, values): + """ + Whether each element in the DataFrame is contained in values. + + Args: + values (iterable, or dict): + The result will only be true at a location if all the + labels match. If `values` is a dict, the keys must be + the column names, which must match. + + Returns: + DataFrame: DataFrame of booleans showing whether each element + in the DataFrame is contained in values. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Sorting @@ -808,6 +841,54 @@ def rmod(self, other, axis: str | int = "columns") -> DataFrame: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def pow(self, other, axis: str | int = "columns") -> DataFrame: + """Get Exponential power of dataframe and other, element-wise (binary operator `pow`). + + Equivalent to ``dataframe ** other``, but with support to substitute a fill_value + for missing data in one of the inputs. With reverse version, `rpow`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. 
+ axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def rpow(self, other, axis: str | int = "columns") -> DataFrame: + """Get Exponential power of dataframe and other, element-wise (binary operator `rpow`). + + Equivalent to ``other ** dataframe``, but with support to substitute a fill_value + for missing data in one of the inputs. With reverse version, `pow`. + + Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to + arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**`. + + .. note:: + Mismatched indices will be unioned together. + + Args: + other (float, int, or Series): + Any single or multiple element data structure, or list-like object. + axis ({0 or 'index', 1 or 'columns'}): + Whether to compare by the index (0 or 'index') or columns. + (1 or 'columns'). For Series input, axis to match Series index on. + + Returns: + DataFrame: DataFrame result of the arithmetic operation. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + # ---------------------------------------------------------------------- # Data reshaping @@ -1338,3 +1419,20 @@ def value_counts( Series: Series containing counts of unique rows in the DataFrame """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def fillna(self, value): + """ + Fill NA/NaN values using the specified method. + + Args: + value (scalar, Series): + Value to use to fill holes (e.g. 0), alternately a + Series of values specifying which value to use for + each index (for a Series) or column (for a DataFrame). Values not + in the Series will not be filled. This value cannot + be a list. 
+ + Returns: + DataFrame: Object with missing values filled + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) diff --git a/third_party/bigframes_vendored/pandas/core/reshape/merge.py b/third_party/bigframes_vendored/pandas/core/reshape/merge.py new file mode 100644 index 0000000000..ee02d698da --- /dev/null +++ b/third_party/bigframes_vendored/pandas/core/reshape/merge.py @@ -0,0 +1,78 @@ +# Contains code from https://github.com/pandas-dev/pandas/blob/main/pandas/core/reshape/merge.py +""" +SQL-style merge routines +""" +from __future__ import annotations + + +def merge( + left, + right, + how="inner", + on=None, + *, + left_on=None, + right_on=None, + sort=False, + suffixes=("_x", "_y"), +): + + """ + Merge DataFrame objects with a database-style join. + + The join is done on columns or indexes. If joining columns on + columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes + on indexes or indexes on a column or columns, the index will be passed on. + When performing a cross merge, no column specifications to merge on are + allowed. + + .. note:: + A named Series object is treated as a DataFrame with a single named column. + + .. warning:: + If both key columns contain rows where the key is a null value, those + rows will be matched against each other. This is different from usual SQL + join behaviour and can lead to unexpected results. + + Args: + left: + The primary object to be merged. + right: + Object to merge with. + how: + ``{'left', 'right', 'outer', 'inner'}, default 'inner'`` + Type of merge to be performed. + ``left``: use only keys from left frame, similar to a SQL left outer join; + preserve key order. + ``right``: use only keys from right frame, similar to a SQL right outer join; + preserve key order. + ``outer``: use union of keys from both frames, similar to a SQL full outer + join; sort keys lexicographically. 
+ ``inner``: use intersection of keys from both frames, similar to a SQL inner + join; preserve the order of the left keys. + + on: + Column to join on. It must be found in both DataFrames. Either on or left_on + right_on + must be passed in. + left_on: + Column to join on in the left DataFrame. Either on or left_on + right_on + must be passed in. + right_on: + Column to join on in the right DataFrame. Either on or left_on + right_on + must be passed in. + sort: + Default False. Sort the join keys lexicographically in the + result DataFrame. If False, the order of the join keys depends + on the join type (how keyword). + suffixes: + Default ``("_x", "_y")``. A length-2 sequence where each + element is optionally a string indicating the suffix to add to + overlapping column names in `left` and `right` respectively. + Pass a value of `None` instead of a string to indicate that the + column name from `left` or `right` should be left as-is, with + no suffix. At least one of the values must not be None. + + Returns: + bigframes.dataframe.DataFrame: A DataFrame of the two merged objects. + """ + raise NotImplementedError("abstract method") diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 8d505c1ead..76fb46a700 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -207,10 +207,16 @@ def to_dict(self, into: type[dict] = dict) -> Mapping: """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - def to_frame(self) -> DataFrame: + def to_frame(self, name=None) -> DataFrame: """ Convert Series to DataFrame. + The column in the new dataframe will be named name (the keyword parameter) + if the name parameter is provided and not None. + + Args: + name (Hashable, default None) + Returns: bigframes.dataframe.DataFrame: DataFrame representation of Series. 
""" @@ -830,6 +836,23 @@ def fillna( """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def dropna(self, *, axis=0, inplace: bool = False, how=None) -> Series: + """ + Return a new Series with missing values removed. + + Args: + axis (0 or 'index'): + Unused. Parameter needed for compatibility with DataFrame. + inplace (bool, default False): + Unsupported, do not set. + how (str, optional): + Not in use. Kept for compatibility. + + Returns: + Series: Series with NA entries dropped from it. + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + def between( self, left, @@ -1185,9 +1208,39 @@ def mod(self, other) -> Series: raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) def rmod(self, other) -> Series: - """Get modulo of Series and other, element-wise (binary operator `rmod`). + """Return modulo of Series and other, element-wise (binary operator `rmod`). + + Equivalent to ``other % series``, but with support to substitute a fill_value for + missing data in either one of the inputs. + + Args: + other (Series, or scalar value): + + Returns: + bigframes.series.Series: The result of the operation. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) - Equivalent to ``other % series``, but with support to substitute a fill_value for + def pow(self, other) -> Series: + """Return Exponential power of series and other, element-wise (binary operator `pow`). + + Equivalent to ``series ** other``, but with support to substitute a fill_value for + missing data in either one of the inputs. + + Args: + other (Series, or scalar value): + + Returns: + bigframes.series.Series: The result of the operation. + + """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def rpow(self, other) -> Series: + """Return Exponential power of series and other, element-wise (binary operator `rpow`). 
+ + Equivalent to ``other ** series``, but with support to substitute a fill_value for missing data in either one of the inputs. Args: @@ -1640,3 +1693,46 @@ def is_monotonic_decreasing(self) -> bool: bool """ raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) + + def map( + self, + arg, + na_action=None, + *, + verify_integrity=False, + ) -> Series: + """ + Map values of Series according to an input mapping or function. + + Used for substituting each value in a Series with another value, + that may be derived from a remote function, ``dict``, or a :class:`Series`. + + If arg is a remote function, the overhead for remote functions + applies. If mapping with a dict, fully deferred computation is possible. + If mapping with a Series, fully deferred computation is only possible if + verify_integrity=False. + + .. note:: + Bigframes does not yet support ``dict`` subclasses that define + ``__missing__`` (i.e. provide a method for default values). These + are treated the same as ``dict``. + + Args: + arg (function, Mapping, Series): + remote function, collections.abc.Mapping subclass or Series + Mapping correspondence. + na_action (str, default None): + Only None is currently supported, indicating that arg may + map NA values to scalars. NA values won't be ignored. + Passing 'ignore' will raise NotImplementedError. + verify_integrity (bool, default False): + Only applies when arg is a Series. If True, throw if the Series + index contains duplicate entries (this matches pandas behavior). + If False, skip the expensive computation, and any duplicate + index entries will produce duplicate rows in the result for each + index entry. + + Returns: + Series: Same index as caller. 
+ """ + raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE) From 106c75c79e6ba33f613075053c84ac7fe7d22c2c Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 5 Sep 2023 06:20:31 -0500 Subject: [PATCH 7/7] chore(main): release 0.3.0 (#13) Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> --- CHANGELOG.md | 67 ++++++++++++++++++++++++++++++++++++++++++++ bigframes/version.py | 2 +- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ab2b05d57..7770534cad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,73 @@ [1]: https://pypi.org/project/bigframes/#history +## [0.3.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v0.2.0...v0.3.0) (2023-09-02) + + +### Features + +* Add `bigframes.get_global_session()` and `bigframes.reset_session()` aliases ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Add `bigframes.pandas.read_pickle` function ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Add `components_`, `explained_variance_`, and `explained_variance_ratio_` properties to `bigframes.ml.decomposition.PCA` ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Add `fit_transform` to `bigquery.ml` transformers ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Add `Series.dropna` and `DataFrame.fillna` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Add `Series.str` methods `isalpha`, `isdigit`, `isdecimal`, `isalnum`, `isspace`, `islower`, `isupper`, `zfill`, `center` 
([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Support `bigframes.pandas.merge()` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support `DataFrame.isin` with list and dict inputs ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support `DataFrame.pivot` ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Support `DataFrame.stack` ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `DataFrame`-`DataFrame` binary operations ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support `df[my_column] = [a python list]` ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `Index.is_monotonic` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support `np.arcsin`, `np.arccos`, `np.arctan`, `np.sinh`, `np.cosh`, `np.tanh`, `np.arcsinh`, `np.arccosh`, `np.arctanh`, `np.exp` with Series argument ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `np.sin`, `np.cos`, `np.tan`, `np.log`, `np.log10`, `np.sqrt`, `np.abs` with Series argument ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `pow()` and power operator in `DataFrame` and `Series` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support `read_json` with `engine=bigquery` for newline-delimited JSON files 
([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `Series.corr` ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Support `Series.map` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support for `np.add`, `np.subtract`, `np.multiply`, `np.divide`, `np.power` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Support MultiIndex for DataFrame columns ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Use `pandas.Index` for column labels ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Use default session and connection in `ml.llm` and `ml.imported` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) + + +### Bug Fixes + +* Add error message to `set_index` ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Align column names with pandas in `DataFrame.agg` results ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Allow (but still not recommended) `ORDER BY` in `read_gbq` input when an `index_col` is defined ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Check for IAM role on the BigQuery connection when initializing a `remote_function` ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Check that types are specified in `read_gbq_function` 
([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Don't use query cache for Session construction ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Include survey link in abstract `NotImplementedError` exception messages ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Label temp table creation jobs with `source=bigquery-dataframes-temp` label ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Make `X_train` argument names consistent across methods ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) +* Raise AttributeError for unimplemented pandas methods ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Raise exception for invalid function in `read_gbq_function` ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Support spaces in column names in `DataFrame` initializer ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) + + +### Performance Improvements + +* Add local cache for `__repr_*__` methods ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Lazily instantiate client library objects ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Use `row_number()` filter for `head` / `tail` ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) + + +### Documentation + +* Add ML section under Overview 
([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Add release status to table of contents ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Add samples and best practices to `read_gbq` docs ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Correct the return types of Dataframe and Series ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Create subfolders for notebooks ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Fix link to GitHub ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Highlight bigframes is open-source ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Sample ML Drug Name Generation notebook ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Set `options.bigquery.project` in sample code ([89b9503](https://github.com/googleapis/python-bigquery-dataframes/commit/89b95033d6b449bfc21249057d7c024d096c80d0)) +* Transform remote function user guide into sample code ([a32b747](https://github.com/googleapis/python-bigquery-dataframes/commit/a32b74751785c8e8aec40ce01df639dd7c4fbb77)) +* Update remote function notebook with read_gbq_function usage ([8fab755](https://github.com/googleapis/python-bigquery-dataframes/commit/8fab75576757230bca5c7df10994837ac406300f)) + ## 0.2.0 (2023-08-17) ### Features diff --git a/bigframes/version.py b/bigframes/version.py index d386742d59..4cc4639705 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,4 +12,4 @@ # See the License for the specific language 
governing permissions and # limitations under the License. -__version__ = "0.2.0" +__version__ = "0.3.0"