diff --git a/CHANGELOG.md b/CHANGELOG.md index bfa43f7fc4..fc4362cc87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,15 @@ [1]: https://pypi.org/project/bigframes/#history +## [2.17.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.16.0...v2.17.0) (2025-08-22) + + +### Features + +* Add isin local execution impl ([#1993](https://github.com/googleapis/python-bigquery-dataframes/issues/1993)) ([26df6e6](https://github.com/googleapis/python-bigquery-dataframes/commit/26df6e691bb27ed09322a81214faedbf3639b32e)) +* Add reset_index names, col_level, col_fill, allow_duplicates args ([#2017](https://github.com/googleapis/python-bigquery-dataframes/issues/2017)) ([c02a1b6](https://github.com/googleapis/python-bigquery-dataframes/commit/c02a1b67d27758815430bb8006ac3a72cea55a89)) +* Support callable for series mask method ([#2014](https://github.com/googleapis/python-bigquery-dataframes/issues/2014)) ([5ac32eb](https://github.com/googleapis/python-bigquery-dataframes/commit/5ac32ebe17cfda447870859f5dd344b082b4d3d0)) + ## [2.16.0](https://github.com/googleapis/python-bigquery-dataframes/compare/v2.15.0...v2.16.0) (2025-08-20) diff --git a/bigframes/core/blocks.py b/bigframes/core/blocks.py index d2662da509..1a2544704c 100644 --- a/bigframes/core/blocks.py +++ b/bigframes/core/blocks.py @@ -387,12 +387,21 @@ def reversed(self) -> Block: index_labels=self.index.names, ) - def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: + def reset_index( + self, + level: LevelsType = None, + drop: bool = True, + *, + col_level: Union[str, int] = 0, + col_fill: typing.Hashable = "", + allow_duplicates: bool = False, + ) -> Block: """Reset the index of the block, promoting the old index to a value column. Arguments: level: the label or index level of the index levels to remove. name: this is the column id for the new value id derived from the old index + allow_duplicates: Returns: A new Block because dropping index columns can break references @@ -438,6 +447,11 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: ) else: # Add index names to column index + col_level_n = ( + col_level + if isinstance(col_level, int) + else self.column_labels.names.index(col_level) + ) column_labels_modified = self.column_labels for position, level_id in enumerate(level_ids): label = self.col_id_to_index_name[level_id] @@ -447,11 +461,15 @@ def reset_index(self, level: LevelsType = None, drop: bool = True) -> Block: else: label = f"level_{self.index_columns.index(level_id)}" - if label in self.column_labels: + if (not allow_duplicates) and (label in self.column_labels): raise ValueError(f"cannot insert {label}, already exists") + if isinstance(self.column_labels, pd.MultiIndex): nlevels = self.column_labels.nlevels - label = tuple(label if i == 0 else "" for i in range(nlevels)) + label = tuple( + label if i == col_level_n else col_fill for i in range(nlevels) + ) + # Create index copy with label inserted # See: https://pandas.pydata.org/docs/reference/api/pandas.Index.insert.html column_labels_modified = column_labels_modified.insert(position, label) diff --git a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py index bc077c1ce3..f3653efc56 100644 --- a/bigframes/core/compile/ibis_compiler/scalar_op_registry.py +++ b/bigframes/core/compile/ibis_compiler/scalar_op_registry.py @@ -1062,7 +1062,7 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp): if op.match_nulls and contains_nulls: return x.isnull() | x.isin(matchable_ibis_values) else: - return x.isin(matchable_ibis_values) + return x.isin(matchable_ibis_values).fillna(False) @scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True) diff --git a/bigframes/core/compile/polars/compiler.py b/bigframes/core/compile/polars/compiler.py index 8ae896816f..1ba76dee5b 100644 --- a/bigframes/core/compile/polars/compiler.py +++ b/bigframes/core/compile/polars/compiler.py @@ -263,11 +263,9 @@ def _(self, op: ops.ScalarOp, l_input: pl.Expr, r_input: pl.Expr) -> pl.Expr: def _(self, op: ops.ScalarOp, input: pl.Expr) -> pl.Expr: # TODO: Filter out types that can't be coerced to right type assert isinstance(op, gen_ops.IsInOp) - if op.match_nulls or not any(map(pd.isna, op.values)): - # newer polars version have nulls_equal arg - return input.is_in(op.values) - else: - return input.is_in(op.values) or input.is_null() + assert not op.match_nulls # should be stripped by a lowering step rn + values = pl.Series(op.values, strict=False) + return input.is_in(values) @compile_op.register(gen_ops.FillNaOp) @compile_op.register(gen_ops.CoalesceOp) diff --git a/bigframes/core/compile/polars/lowering.py b/bigframes/core/compile/polars/lowering.py index f6ed6c676c..876ff2794f 100644 --- a/bigframes/core/compile/polars/lowering.py +++ b/bigframes/core/compile/polars/lowering.py @@ -13,8 +13,10 @@ # limitations under the License. import dataclasses +from typing import cast import numpy as np +import pandas as pd from bigframes import dtypes from bigframes.core import bigframe_node, expression @@ -316,6 +318,35 @@ def lower(self, expr: expression.OpExpression) -> expression.Expression: return expr +class LowerIsinOp(op_lowering.OpLoweringRule): + @property + def op(self) -> type[ops.ScalarOp]: + return generic_ops.IsInOp + + def lower(self, expr: expression.OpExpression) -> expression.Expression: + assert isinstance(expr.op, generic_ops.IsInOp) + arg = expr.children[0] + new_values = [] + match_nulls = False + for val in expr.op.values: + # coercible, non-coercible + # float NaN/inf should be treated as distinct from 'true' null values + if cast(bool, pd.isna(val)) and not isinstance(val, float): + if expr.op.match_nulls: + match_nulls = True + elif dtypes.is_compatible(val, arg.output_type): + new_values.append(val) + else: + pass + + new_isin = ops.IsInOp(tuple(new_values), match_nulls=False).as_expr(arg) + if match_nulls: + return ops.coalesce_op.as_expr(new_isin, expression.const(True)) + else: + # polars propagates nulls, so need to coalesce to false + return ops.coalesce_op.as_expr(new_isin, expression.const(False)) + + def _coerce_comparables( expr1: expression.Expression, expr2: expression.Expression, @@ -414,6 +445,7 @@ def _lower_cast(cast_op: ops.AsTypeOp, arg: expression.Expression): LowerModRule(), LowerAsTypeRule(), LowerInvertOp(), + LowerIsinOp(), ) diff --git a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py index 61f1eba607..3fcba04cfd 100644 --- a/bigframes/core/compile/sqlglot/expressions/binary_compiler.py +++ b/bigframes/core/compile/sqlglot/expressions/binary_compiler.py @@ -38,21 +38,15 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.Concat(expressions=[left.expr, right.expr]) if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): - left_expr = left.expr - if left.dtype == dtypes.BOOL_DTYPE: - left_expr = sge.Cast(this=left_expr, to="INT64") - right_expr = right.expr - if right.dtype == dtypes.BOOL_DTYPE: - right_expr = sge.Cast(this=right_expr, to="INT64") + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) return sge.Add(this=left_expr, expression=right_expr) if ( dtypes.is_time_or_date_like(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE ): - left_expr = left.expr - if left.dtype == dtypes.DATE_DTYPE: - left_expr = sge.Cast(this=left_expr, to="DATETIME") + left_expr = _coerce_date_to_datetime(left) return sge.TimestampAdd( this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") ) @@ -60,9 +54,7 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: dtypes.is_time_or_date_like(right.dtype) and left.dtype == dtypes.TIMEDELTA_DTYPE ): - right_expr = right.expr - if right.dtype == dtypes.DATE_DTYPE: - right_expr = sge.Cast(this=right_expr, to="DATETIME") + right_expr = _coerce_date_to_datetime(right) return sge.TimestampAdd( this=right_expr, expression=left.expr, unit=sge.Var(this="MICROSECOND") ) @@ -74,14 +66,37 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: ) -@BINARY_OP_REGISTRATION.register(ops.div_op) +@BINARY_OP_REGISTRATION.register(ops.eq_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.EQ(this=left_expr, expression=right_expr) + + +@BINARY_OP_REGISTRATION.register(ops.eq_null_match_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: left_expr = left.expr - if left.dtype == dtypes.BOOL_DTYPE: - left_expr = sge.Cast(this=left_expr, to="INT64") + if right.dtype != dtypes.BOOL_DTYPE: + left_expr = _coerce_bool_to_int(left) + right_expr = right.expr - if right.dtype == dtypes.BOOL_DTYPE: - right_expr = sge.Cast(this=right_expr, to="INT64") + if left.dtype != dtypes.BOOL_DTYPE: + right_expr = _coerce_bool_to_int(right) + + sentinel = sge.convert("$NULL_SENTINEL$") + left_coalesce = sge.Coalesce( + this=sge.Cast(this=left_expr, to="STRING"), expressions=[sentinel] + ) + right_coalesce = sge.Coalesce( + this=sge.Cast(this=right_expr, to="STRING"), expressions=[sentinel] + ) + return sge.EQ(this=left_coalesce, expression=right_coalesce) + + +@BINARY_OP_REGISTRATION.register(ops.div_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) result = sge.func("IEEE_DIVIDE", left_expr, right_expr) if left.dtype == dtypes.TIMEDELTA_DTYPE and dtypes.is_numeric(right.dtype): @@ -92,12 +107,8 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: @BINARY_OP_REGISTRATION.register(ops.floordiv_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = left.expr - if left.dtype == dtypes.BOOL_DTYPE: - left_expr = sge.Cast(this=left_expr, to="INT64") - right_expr = right.expr - if right.dtype == dtypes.BOOL_DTYPE: - right_expr = sge.Cast(this=right_expr, to="INT64") + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) result: sge.Expression = sge.Cast( this=sge.Floor(this=sge.func("IEEE_DIVIDE", left_expr, right_expr)), to="INT64" @@ -129,7 +140,16 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: @BINARY_OP_REGISTRATION.register(ops.ge_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: - return sge.GTE(this=left.expr, expression=right.expr) + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.GTE(this=left_expr, expression=right_expr) + + +@BINARY_OP_REGISTRATION.register(ops.gt_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.GT(this=left_expr, expression=right_expr) @BINARY_OP_REGISTRATION.register(ops.JSONSet) @@ -137,14 +157,24 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: return sge.func("JSON_SET", left.expr, sge.convert(op.json_path), right.expr) +@BINARY_OP_REGISTRATION.register(ops.lt_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.LT(this=left_expr, expression=right_expr) + + +@BINARY_OP_REGISTRATION.register(ops.le_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.LTE(this=left_expr, expression=right_expr) + + @BINARY_OP_REGISTRATION.register(ops.mul_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: - left_expr = left.expr - if left.dtype == dtypes.BOOL_DTYPE: - left_expr = sge.Cast(this=left_expr, to="INT64") - right_expr = right.expr - if right.dtype == dtypes.BOOL_DTYPE: - right_expr = sge.Cast(this=right_expr, to="INT64") + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) result = sge.Mul(this=left_expr, expression=right_expr) @@ -156,36 +186,38 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: return result +@BINARY_OP_REGISTRATION.register(ops.ne_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) + return sge.NEQ(this=left_expr, expression=right_expr) + + +@BINARY_OP_REGISTRATION.register(ops.obj_make_ref_op) +def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: + return sge.func("OBJ.MAKE_REF", left.expr, right.expr) + + @BINARY_OP_REGISTRATION.register(ops.sub_op) def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: if dtypes.is_numeric(left.dtype) and dtypes.is_numeric(right.dtype): - left_expr = left.expr - if left.dtype == dtypes.BOOL_DTYPE: - left_expr = sge.Cast(this=left_expr, to="INT64") - right_expr = right.expr - if right.dtype == dtypes.BOOL_DTYPE: - right_expr = sge.Cast(this=right_expr, to="INT64") + left_expr = _coerce_bool_to_int(left) + right_expr = _coerce_bool_to_int(right) return sge.Sub(this=left_expr, expression=right_expr) if ( dtypes.is_time_or_date_like(left.dtype) and right.dtype == dtypes.TIMEDELTA_DTYPE ): - left_expr = left.expr - if left.dtype == dtypes.DATE_DTYPE: - left_expr = sge.Cast(this=left_expr, to="DATETIME") + left_expr = _coerce_date_to_datetime(left) return sge.TimestampSub( this=left_expr, expression=right.expr, unit=sge.Var(this="MICROSECOND") ) if dtypes.is_time_or_date_like(left.dtype) and dtypes.is_time_or_date_like( right.dtype ): - left_expr = left.expr - if left.dtype == dtypes.DATE_DTYPE: - left_expr = sge.Cast(this=left_expr, to="DATETIME") - right_expr = right.expr - if right.dtype == dtypes.DATE_DTYPE: - right_expr = sge.Cast(this=right_expr, to="DATETIME") + left_expr = _coerce_date_to_datetime(left) + right_expr = _coerce_date_to_datetime(right) return sge.TimestampDiff( this=left_expr, expression=right_expr, unit=sge.Var(this="MICROSECOND") ) @@ -198,6 +230,15 @@ def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: ) -@BINARY_OP_REGISTRATION.register(ops.obj_make_ref_op) -def _(op, left: TypedExpr, right: TypedExpr) -> sge.Expression: - return sge.func("OBJ.MAKE_REF", left.expr, right.expr) +def _coerce_bool_to_int(typed_expr: TypedExpr) -> sge.Expression: + """Coerce boolean expression to integer.""" + if typed_expr.dtype == dtypes.BOOL_DTYPE: + return sge.Cast(this=typed_expr.expr, to="INT64") + return typed_expr.expr + + +def _coerce_date_to_datetime(typed_expr: TypedExpr) -> sge.Expression: + """Coerce date expression to datetime.""" + if typed_expr.dtype == dtypes.DATE_DTYPE: + return sge.Cast(this=typed_expr.expr, to="DATETIME") + return typed_expr.expr diff --git a/bigframes/dataframe.py b/bigframes/dataframe.py index c58cbaba6a..921893fb83 100644 --- a/bigframes/dataframe.py +++ b/bigframes/dataframe.py @@ -2321,6 +2321,10 @@ def reset_index( level: blocks.LevelsType = ..., drop: bool = ..., inplace: Literal[False] = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> DataFrame: ... @@ -2330,19 +2334,56 @@ def reset_index( level: blocks.LevelsType = ..., drop: bool = ..., inplace: Literal[True] = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> None: ... @overload def reset_index( - self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = ... + self, + level: blocks.LevelsType = None, + drop: bool = False, + inplace: bool = ..., + col_level: Union[int, str] = ..., + col_fill: Hashable = ..., + allow_duplicates: Optional[bool] = ..., + names: Union[None, Hashable, Sequence[Hashable]] = ..., ) -> Optional[DataFrame]: ... def reset_index( - self, level: blocks.LevelsType = None, drop: bool = False, inplace: bool = False + self, + level: blocks.LevelsType = None, + drop: bool = False, + inplace: bool = False, + col_level: Union[int, str] = 0, + col_fill: Hashable = "", + allow_duplicates: Optional[bool] = None, + names: Union[None, Hashable, Sequence[Hashable]] = None, ) -> Optional[DataFrame]: - block = self._block.reset_index(level, drop) + block = self._block + if names: + if isinstance(names, blocks.Label) and not isinstance(names, tuple): + names = [names] + else: + names = list(names) + + if len(names) != self.index.nlevels: + raise ValueError("'names' must be same length as levels") + + block = block.with_index_labels(names) + if allow_duplicates is None: + allow_duplicates = False + block = block.reset_index( + level, + drop, + col_level=col_level, + col_fill=col_fill, + allow_duplicates=allow_duplicates, + ) if inplace: self._set_block(block) return None @@ -2755,11 +2796,11 @@ def isin(self, values) -> DataFrame: False, label=label, dtype=pandas.BooleanDtype() ) result_ids.append(result_id) - return DataFrame(block.select_columns(result_ids)).fillna(value=False) + return DataFrame(block.select_columns(result_ids)) elif utils.is_list_like(values): return self._apply_unary_op( ops.IsInOp(values=tuple(values), match_nulls=True) - ).fillna(value=False) + ) else: raise TypeError( "only list-like objects are allowed to be passed to " diff --git a/bigframes/series.py b/bigframes/series.py index 6f48935ec9..80952f38bc 100644 --- a/bigframes/series.py +++ b/bigframes/series.py @@ -414,6 +414,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: Literal[False] = ..., inplace: Literal[False] = ..., + allow_duplicates: Optional[bool] = ..., ) -> bigframes.dataframe.DataFrame: ... @@ -425,6 +426,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: Literal[True] = ..., inplace: Literal[False] = ..., + allow_duplicates: Optional[bool] = ..., ) -> Series: ... @@ -436,6 +438,7 @@ def reset_index( name: typing.Optional[str] = ..., drop: bool = ..., inplace: Literal[True] = ..., + allow_duplicates: Optional[bool] = ..., ) -> None: ... @@ -447,8 +450,11 @@ def reset_index( name: typing.Optional[str] = None, drop: bool = False, inplace: bool = False, + allow_duplicates: Optional[bool] = None, ) -> bigframes.dataframe.DataFrame | Series | None: - block = self._block.reset_index(level, drop) + if allow_duplicates is None: + allow_duplicates = False + block = self._block.reset_index(level, drop, allow_duplicates=allow_duplicates) if drop: if inplace: self._set_block(block) @@ -2107,13 +2113,8 @@ def duplicated(self, keep: str = "first") -> Series: ) def mask(self, cond, other=None) -> Series: - if callable(cond): - if hasattr(cond, "bigframes_bigquery_function"): - cond = self.apply(cond) - else: - # For non-BigQuery function assume that it is applicable on Series - cond = self.apply(cond, by_row=False) - + cond = self._apply_callable(cond) + other = self._apply_callable(other) if not isinstance(cond, Series): raise TypeError( f"Only bigframes series condition is supported, received {type(cond).__name__}. " diff --git a/bigframes/session/polars_executor.py b/bigframes/session/polars_executor.py index 9c45a884e5..6e3f0ca10f 100644 --- a/bigframes/session/polars_executor.py +++ b/bigframes/session/polars_executor.py @@ -66,6 +66,7 @@ generic_ops.FillNaOp, generic_ops.CaseWhenOp, generic_ops.InvertOp, + generic_ops.IsInOp, generic_ops.IsNullOp, generic_ops.NotNullOp, ) diff --git a/bigframes/version.py b/bigframes/version.py index 6b84e2eb1d..b9aa5d1855 100644 --- a/bigframes/version.py +++ b/bigframes/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.16.0" +__version__ = "2.17.0" # {x-release-please-start-date} -__release_date__ = "2025-08-20" +__release_date__ = "2025-08-22" # {x-release-please-end} diff --git a/tests/system/large/functions/test_managed_function.py b/tests/system/large/functions/test_managed_function.py index 262f5f0fe2..43fb322567 100644 --- a/tests/system/large/functions/test_managed_function.py +++ b/tests/system/large/functions/test_managed_function.py @@ -1077,7 +1077,7 @@ def func_for_other(x): ) -def test_managed_function_series_where(session, dataset_id, scalars_dfs): +def test_managed_function_series_where_mask(session, dataset_id, scalars_dfs): try: # The return type has to be bool type for callable where condition. @@ -1098,8 +1098,8 @@ def _is_positive(s): pd_int64 = scalars_pandas["int64_col"] pd_int64_filtered = pd_int64.dropna() - # The cond is a callable (managed function) and the other is not a - # callable in series.where method. + # Test series.where method: the cond is a callable (managed function) + # and the other is not a callable. bf_result = bf_int64_filtered.where( cond=is_positive_mf, other=-bf_int64_filtered ).to_pandas() @@ -1108,6 +1108,16 @@ def _is_positive(s): # Ignore any dtype difference. pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + # Test series.mask method: the cond is a callable (managed function) + # and the other is not a callable. + bf_result = bf_int64_filtered.mask( + cond=is_positive_mf, other=-bf_int64_filtered + ).to_pandas() + pd_result = pd_int64_filtered.mask(cond=_is_positive, other=-pd_int64_filtered) + + # Ignore any dtype difference. + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: # Clean up the gcp assets created for the managed function. cleanup_function_assets(is_positive_mf, session.bqclient, ignore_failures=False) diff --git a/tests/system/large/functions/test_remote_function.py b/tests/system/large/functions/test_remote_function.py index 9e2c1e2c81..1c44b7e5fb 100644 --- a/tests/system/large/functions/test_remote_function.py +++ b/tests/system/large/functions/test_remote_function.py @@ -2933,7 +2933,7 @@ def func_for_other(x): @pytest.mark.flaky(retries=2, delay=120) -def test_remote_function_series_where(session, dataset_id, scalars_dfs): +def test_remote_function_series_where_mask(session, dataset_id, scalars_dfs): try: def _ten_times(x): @@ -2954,8 +2954,8 @@ def _ten_times(x): pd_int64 = scalars_pandas["float64_col"] pd_int64_filtered = pd_int64.dropna() - # The cond is not a callable and the other is a callable (remote - # function) in series.where method. + # Test series.where method: the cond is not a callable and the other is + # a callable (remote function). bf_result = bf_int64_filtered.where( cond=bf_int64_filtered < 0, other=ten_times_mf ).to_pandas() @@ -2966,6 +2966,16 @@ def _ten_times(x): # Ignore any dtype difference. pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + # Test series.mask method: the cond is not a callable and the other is + # a callable (remote function). + bf_result = bf_int64_filtered.mask( + cond=bf_int64_filtered < 0, other=ten_times_mf + ).to_pandas() + pd_result = pd_int64_filtered.mask(cond=pd_int64_filtered < 0, other=_ten_times) + + # Ignore any dtype difference. + pandas.testing.assert_series_equal(bf_result, pd_result, check_dtype=False) + finally: # Clean up the gcp assets created for the remote function. cleanup_function_assets(ten_times_mf, session.bqclient, ignore_failures=False) diff --git a/tests/system/small/engines/test_comparison_ops.py b/tests/system/small/engines/test_comparison_ops.py index fefff93f58..0fcc48b10a 100644 --- a/tests/system/small/engines/test_comparison_ops.py +++ b/tests/system/small/engines/test_comparison_ops.py @@ -48,7 +48,7 @@ def apply_op_pairwise( return new_arr -@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +@pytest.mark.parametrize("engine", ["polars", "bq", "bq-sqlglot"], indirect=True) @pytest.mark.parametrize( "op", [ diff --git a/tests/system/small/engines/test_generic_ops.py b/tests/system/small/engines/test_generic_ops.py index 9fdb6bca78..1d28c335a6 100644 --- a/tests/system/small/engines/test_generic_ops.py +++ b/tests/system/small/engines/test_generic_ops.py @@ -390,3 +390,36 @@ def test_engines_invert_op(scalars_array_value: array_value.ArrayValue, engine): ) assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) + + +@pytest.mark.parametrize("engine", ["polars", "bq"], indirect=True) +def test_engines_isin_op(scalars_array_value: array_value.ArrayValue, engine): + arr, col_ids = scalars_array_value.compute_values( + [ + ops.IsInOp((1, 2, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456)).as_expr(expression.deref("int64_col")), + ops.IsInOp((None, 123456), match_nulls=False).as_expr( + expression.deref("int64_col") + ), + ops.IsInOp((1.0, 2.0, 3.0)).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", "2.0")).as_expr(expression.deref("int64_col")), + ops.IsInOp(("1.0", 2.5, 3)).as_expr(expression.deref("int64_col")), + ops.IsInOp(()).as_expr(expression.deref("int64_col")), + ops.IsInOp((1, 2, 3, None)).as_expr(expression.deref("float64_col")), + ] + ) + new_names = ( + "int in ints", + "int in ints w null", + "int in ints w null wo match nulls", + "int in floats", + "int in strings", + "int in mixed", + "int in empty", + "float in ints", + ) + arr = arr.rename_columns( + {old_name: new_names[i] for i, old_name in enumerate(col_ids)} + ) + + assert_equivalence_execution(arr.node, REFERENCE_ENGINE, engine) diff --git a/tests/system/small/test_dataframe.py b/tests/system/small/test_dataframe.py index 3b70dec0e9..8a570ade45 100644 --- a/tests/system/small/test_dataframe.py +++ b/tests/system/small/test_dataframe.py @@ -1591,7 +1591,7 @@ def test_itertuples(scalars_df_index, index, name): assert bf_tuple == pd_tuple -def test_df_isin_list(scalars_dfs): +def test_df_isin_list_w_null(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = ["Hello, World!", 55555, 2.51, pd.NA, True] bf_result = ( @@ -1606,6 +1606,21 @@ def test_df_isin_list(scalars_dfs): pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) +def test_df_isin_list_wo_null(scalars_dfs): + scalars_df, scalars_pandas_df = scalars_dfs + values = ["Hello, World!", 55555, 2.51, True] + bf_result = ( + scalars_df[["int64_col", "float64_col", "string_col", "bool_col"]] + .isin(values) + .to_pandas() + ) + pd_result = scalars_pandas_df[ + ["int64_col", "float64_col", "string_col", "bool_col"] + ].isin(values) + + pandas.testing.assert_frame_equal(bf_result, pd_result.astype("boolean")) + + def test_df_isin_dict(scalars_dfs): scalars_df, scalars_pandas_df = scalars_dfs values = { @@ -2070,6 +2085,32 @@ def test_reset_index(scalars_df_index, scalars_pandas_df_index, drop): pandas.testing.assert_frame_equal(bf_result, pd_result) +def test_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + df = scalars_df_index.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + scalars_pandas_df_index = scalars_pandas_df_index.copy() + scalars_pandas_df_index.index.name = "int64_col" + pd_result = scalars_pandas_df_index.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pandas.testing.assert_frame_equal(bf_result, pd_result) + + +def test_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index.copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + @pytest.mark.parametrize( ("drop",), ((True,), (False,)), diff --git a/tests/system/small/test_multiindex.py b/tests/system/small/test_multiindex.py index 0c23ea97ae..f15b8d8b21 100644 --- a/tests/system/small/test_multiindex.py +++ b/tests/system/small/test_multiindex.py @@ -929,16 +929,30 @@ def test_column_multi_index_rename(scalars_df_index, scalars_pandas_df_index): pandas.testing.assert_frame_equal(bf_result, pd_result) -def test_column_multi_index_reset_index(scalars_df_index, scalars_pandas_df_index): +@pytest.mark.parametrize( + ("names", "col_fill", "col_level"), + [ + (None, "", "l2"), + (("new_name"), "fill", 1), + ("new_name", "fill", 0), + ], +) +def test_column_multi_index_reset_index( + scalars_df_index, scalars_pandas_df_index, names, col_fill, col_level +): columns = ["int64_too", "int64_col", "float64_col"] - multi_columns = pandas.MultiIndex.from_tuples(zip(["a", "b", "a"], ["a", "b", "b"])) + multi_columns = pandas.MultiIndex.from_tuples( + zip(["a", "b", "a"], ["a", "b", "b"]), names=["l1", "l2"] + ) bf_df = scalars_df_index[columns].copy() bf_df.columns = multi_columns pd_df = scalars_pandas_df_index[columns].copy() pd_df.columns = multi_columns - bf_result = bf_df.reset_index().to_pandas() - pd_result = pd_df.reset_index() + bf_result = bf_df.reset_index( + names=names, col_fill=col_fill, col_level=col_level + ).to_pandas() + pd_result = pd_df.reset_index(names=names, col_fill=col_fill, col_level=col_level) # Pandas uses int64 instead of Int64 (nullable) dtype. pd_result.index = pd_result.index.astype(pandas.Int64Dtype()) diff --git a/tests/system/small/test_series.py b/tests/system/small/test_series.py index 2172962046..165e3b6df0 100644 --- a/tests/system/small/test_series.py +++ b/tests/system/small/test_series.py @@ -1339,6 +1339,32 @@ def test_reset_index_drop(scalars_df_index, scalars_pandas_df_index): pd.testing.assert_series_equal(bf_result.to_pandas(), pd_result) +def test_series_reset_index_allow_duplicates(scalars_df_index, scalars_pandas_df_index): + bf_series = scalars_df_index["int64_col"].copy() + bf_series.index.name = "int64_col" + df = bf_series.reset_index(allow_duplicates=True, drop=False) + assert df.index.name is None + + bf_result = df.to_pandas() + + pd_series = scalars_pandas_df_index["int64_col"].copy() + pd_series.index.name = "int64_col" + pd_result = pd_series.reset_index(allow_duplicates=True, drop=False) + + # Pandas uses int64 instead of Int64 (nullable) dtype. + pd_result.index = pd_result.index.astype(pd.Int64Dtype()) + + # reset_index should maintain the original ordering. + pd.testing.assert_frame_equal(bf_result, pd_result) + + +def test_series_reset_index_duplicates_error(scalars_df_index): + scalars_df_index = scalars_df_index["int64_col"].copy() + scalars_df_index.index.name = "int64_col" + with pytest.raises(ValueError): + scalars_df_index.reset_index(allow_duplicates=False, drop=False) + + def test_series_reset_index_inplace(scalars_df_index, scalars_pandas_df_index): bf_result = scalars_df_index.sort_index(ascending=False)["float64_col"] bf_result.reset_index(drop=True, inplace=True) @@ -3577,6 +3603,26 @@ def test_mask_custom_value(scalars_dfs): assert_pandas_df_equal(bf_result, pd_result) +def test_mask_with_callable(scalars_df_index, scalars_pandas_df_index): + def _ten_times(x): + return x * 10 + + # Both cond and other are callable. + bf_result = ( + scalars_df_index["int64_col"] + .mask(cond=lambda x: x > 0, other=_ten_times) + .to_pandas() + ) + pd_result = scalars_pandas_df_index["int64_col"].mask( + cond=lambda x: x > 0, other=_ten_times + ) + + pd.testing.assert_series_equal( + bf_result, + pd_result, + ) + + @pytest.mark.parametrize( ("lambda_",), [ diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_null_match/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_null_match/out.sql new file mode 100644 index 0000000000..90cbcfe5c7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_null_match/out.sql @@ -0,0 +1,14 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + COALESCE(CAST(`bfcol_1` AS STRING), '$NULL_SENTINEL$') = COALESCE(CAST(CAST(`bfcol_0` AS INT64) AS STRING), '$NULL_SENTINEL$') AS `bfcol_4` + FROM `bfcte_0` +) +SELECT + `bfcol_4` AS `int64_col` +FROM `bfcte_1` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_numeric/out.sql new file mode 100644 index 0000000000..8e3c52310d --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_eq_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` = `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` = 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` = CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) = `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_ne_int`, + `bfcol_40` AS `int_ne_1`, + `bfcol_41` AS `int_ne_bool`, + `bfcol_42` AS `bool_ne_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ge_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ge_numeric/out.sql new file mode 100644 index 0000000000..494cb861a7 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ge_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` >= `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` >= 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` >= CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) >= `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_ge_int`, + `bfcol_40` AS `int_ge_1`, + `bfcol_41` AS `int_ge_bool`, + `bfcol_42` AS `bool_ge_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_gt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_gt_numeric/out.sql new file mode 100644 index 0000000000..b0c8768850 --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_gt_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` > `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` > 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` > CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) > `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_gt_int`, + `bfcol_40` AS `int_gt_1`, + `bfcol_41` AS `int_gt_bool`, + `bfcol_42` AS `bool_gt_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_le_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_le_numeric/out.sql new file mode 100644 index 0000000000..2f642d8cbb --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_le_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` <= `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` <= 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` <= CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) <= `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_le_int`, + `bfcol_40` AS `int_le_1`, + `bfcol_41` AS `int_le_bool`, + `bfcol_42` AS `bool_le_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_lt_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_lt_numeric/out.sql new file mode 100644 index 0000000000..b244e3cbcc --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_lt_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` < `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` < 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` < CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) < `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_lt_int`, + `bfcol_40` AS `int_lt_1`, + `bfcol_41` AS `int_lt_bool`, + `bfcol_42` AS `bool_lt_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ne_numeric/out.sql b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ne_numeric/out.sql new file mode 100644 index 0000000000..6fba4b960f --- /dev/null +++ b/tests/unit/core/compile/sqlglot/expressions/snapshots/test_binary_compiler/test_ne_numeric/out.sql @@ -0,0 +1,54 @@ +WITH `bfcte_0` AS ( + SELECT + `bool_col` AS `bfcol_0`, + `int64_col` AS `bfcol_1`, + `rowindex` AS `bfcol_2` + FROM `bigframes-dev`.`sqlglot_test`.`scalar_types` +), `bfcte_1` AS ( + SELECT + *, + `bfcol_2` AS `bfcol_6`, + `bfcol_1` AS `bfcol_7`, + `bfcol_0` AS `bfcol_8`, + `bfcol_1` <> `bfcol_1` AS `bfcol_9` + FROM `bfcte_0` +), `bfcte_2` AS ( + SELECT + *, + `bfcol_6` AS `bfcol_14`, + `bfcol_7` AS `bfcol_15`, + `bfcol_8` AS `bfcol_16`, + `bfcol_9` AS `bfcol_17`, + `bfcol_7` <> 1 AS `bfcol_18` + FROM `bfcte_1` +), `bfcte_3` AS ( + SELECT + *, + `bfcol_14` AS `bfcol_24`, + `bfcol_15` AS `bfcol_25`, + `bfcol_16` AS `bfcol_26`, + `bfcol_17` AS `bfcol_27`, + `bfcol_18` AS `bfcol_28`, + `bfcol_15` <> CAST(`bfcol_16` AS INT64) AS `bfcol_29` + FROM `bfcte_2` +), `bfcte_4` AS ( + SELECT + *, + `bfcol_24` AS `bfcol_36`, + `bfcol_25` AS `bfcol_37`, + `bfcol_26` AS `bfcol_38`, + `bfcol_27` AS `bfcol_39`, + `bfcol_28` AS `bfcol_40`, + `bfcol_29` AS `bfcol_41`, + CAST(`bfcol_26` AS INT64) <> `bfcol_25` AS `bfcol_42` + FROM `bfcte_3` +) +SELECT + `bfcol_36` AS `rowindex`, + `bfcol_37` AS `int64_col`, + `bfcol_38` AS `bool_col`, + `bfcol_39` AS `int_ne_int`, + `bfcol_40` AS `int_ne_1`, + `bfcol_41` AS `int_ne_bool`, + `bfcol_42` AS `bool_ne_int` +FROM `bfcte_4` \ No newline at end of file diff --git a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py index 49426fe6c3..a2218d0afa 100644 --- a/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py +++ b/tests/unit/core/compile/sqlglot/expressions/test_binary_compiler.py @@ -107,6 +107,24 @@ def test_div_timedelta(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_eq_null_match(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + sql = _apply_binary_op(bf_df, ops.eq_null_match_op, "int64_col", "bool_col") + snapshot.assert_match(sql, "out.sql") + + +def test_eq_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ne_int"] = bf_df["int64_col"] == bf_df["int64_col"] + bf_df["int_ne_1"] = bf_df["int64_col"] == 1 + + bf_df["int_ne_bool"] = bf_df["int64_col"] == bf_df["bool_col"] + bf_df["bool_ne_int"] = bf_df["bool_col"] == bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col", "float64_col"]] @@ -121,8 +139,6 @@ def test_floordiv_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df["int_div_bool"] = bf_df["int64_col"] // bf_df["bool_col"] bf_df["bool_div_int"] = bf_df["bool_col"] // bf_df["int64_col"] - snapshot.assert_match(bf_df.sql, "out.sql") - def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["timestamp_col", "date_col"]] @@ -133,6 +149,30 @@ def test_floordiv_timedelta(scalar_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(bf_df.sql, "out.sql") +def test_gt_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_gt_int"] = bf_df["int64_col"] > bf_df["int64_col"] + bf_df["int_gt_1"] = bf_df["int64_col"] > 1 + + bf_df["int_gt_bool"] = bf_df["int64_col"] > bf_df["bool_col"] + bf_df["bool_gt_int"] = bf_df["bool_col"] > bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_ge_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ge_int"] = bf_df["int64_col"] >= bf_df["int64_col"] + bf_df["int_ge_1"] = bf_df["int64_col"] >= 1 + + bf_df["int_ge_bool"] = bf_df["int64_col"] >= bf_df["bool_col"] + bf_df["bool_ge_int"] = bf_df["bool_col"] >= bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + def test_json_set(json_types_df: bpd.DataFrame, snapshot): bf_df = json_types_df[["json_col"]] sql = _apply_binary_op( @@ -142,6 +182,30 @@ def test_json_set(json_types_df: bpd.DataFrame, snapshot): snapshot.assert_match(sql, "out.sql") +def test_lt_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_lt_int"] = bf_df["int64_col"] < bf_df["int64_col"] + bf_df["int_lt_1"] = bf_df["int64_col"] < 1 + + bf_df["int_lt_bool"] = bf_df["int64_col"] < bf_df["bool_col"] + bf_df["bool_lt_int"] = bf_df["bool_col"] < bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + +def test_le_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_le_int"] = bf_df["int64_col"] <= bf_df["int64_col"] + bf_df["int_le_1"] = bf_df["int64_col"] <= 1 + + bf_df["int_le_bool"] = bf_df["int64_col"] <= bf_df["bool_col"] + bf_df["bool_le_int"] = bf_df["bool_col"] <= bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") + + def test_sub_numeric(scalar_types_df: bpd.DataFrame, snapshot): bf_df = scalar_types_df[["int64_col", "bool_col"]] @@ -200,3 +264,15 @@ def test_mul_timedelta(scalar_types_df: bpd.DataFrame, snapshot): def test_obj_make_ref(scalar_types_df: bpd.DataFrame, snapshot): blob_df = scalar_types_df["string_col"].str.to_blob() snapshot.assert_match(blob_df.to_frame().sql, "out.sql") + + +def test_ne_numeric(scalar_types_df: bpd.DataFrame, snapshot): + bf_df = scalar_types_df[["int64_col", "bool_col"]] + + bf_df["int_ne_int"] = bf_df["int64_col"] != bf_df["int64_col"] + bf_df["int_ne_1"] = bf_df["int64_col"] != 1 + + bf_df["int_ne_bool"] = bf_df["int64_col"] != bf_df["bool_col"] + bf_df["bool_ne_int"] = bf_df["bool_col"] != bf_df["int64_col"] + + snapshot.assert_match(bf_df.sql, "out.sql") diff --git a/tests/unit/functions/test_remote_function_utils.py b/tests/unit/functions/test_remote_function_utils.py index 91fe01e986..0e4ca7a2ac 100644 --- a/tests/unit/functions/test_remote_function_utils.py +++ b/tests/unit/functions/test_remote_function_utils.py @@ -21,6 +21,61 @@ from bigframes.functions import _utils, function_typing +@pytest.mark.parametrize( + ("input_location", "expected_bq_location", "expected_cf_region"), + [ + (None, "us", "us-central1"), + ("us", "us", "us-central1"), + ("eu", "eu", "europe-west1"), + ("US-east4", "us-east4", "us-east4"), + ], +) +def test_get_remote_function_locations( + input_location, expected_bq_location, expected_cf_region +): + """Tests getting remote function locations for various locations.""" + bq_location, cf_region = _utils.get_remote_function_locations(input_location) + + assert bq_location == expected_bq_location + assert cf_region == expected_cf_region + + +@pytest.mark.parametrize( + "func_hash, session_id, uniq_suffix, expected_name", + [ + ( + "hash123", + None, + None, + "bigframes-hash123", + ), + ( + "hash456", + "session789", + None, + "bigframes-session789-hash456", + ), + ( + "hash123", + None, + "suffixABC", + "bigframes-hash123-suffixABC", + ), + ( + "hash456", + "session789", + "suffixDEF", + "bigframes-session789-hash456-suffixDEF", + ), + ], +) +def test_get_cloud_function_name(func_hash, session_id, uniq_suffix, expected_name): + """Tests the construction of the cloud function name from its parts.""" + result = _utils.get_cloud_function_name(func_hash, session_id, uniq_suffix) + + assert result == expected_name + + def test_get_updated_package_requirements_no_extra_package(): """Tests with no extra package.""" result = _utils.get_updated_package_requirements(capture_references=False) diff --git a/third_party/bigframes_vendored/pandas/core/frame.py b/third_party/bigframes_vendored/pandas/core/frame.py index 00984935a4..44ca558070 100644 --- a/third_party/bigframes_vendored/pandas/core/frame.py +++ b/third_party/bigframes_vendored/pandas/core/frame.py @@ -1605,6 +1605,10 @@ def reset_index( *, drop: bool = False, inplace: bool = False, + col_level: Hashable = 0, + col_fill: Hashable = "", + allow_duplicates: Optional[bool] = None, + names: Hashable | Sequence[Hashable] | None = None, ) -> DataFrame | None: """Reset the index. @@ -1706,6 +1710,19 @@ class name speed max the index to the default integer index. inplace (bool, default False): Whether to modify the DataFrame rather than creating a new one. + col_level (int or str, default 0): + If the columns have multiple levels, determines which level the + labels are inserted into. By default it is inserted into the first + level. + col_fill (object, default ''): + If the columns have multiple levels, determines how the other + levels are named. If None then the index name is repeated. + allow_duplicates (bool, optional, default None): + Allow duplicate column labels to be created. + names (str or 1-dimensional list, default None): + Using the given string, rename the DataFrame column which contains the + index data. If the DataFrame has a MultiIndex, this has to be a list or + tuple with length equal to the number of levels Returns: bigframes.pandas.DataFrame: DataFrame with the new index. diff --git a/third_party/bigframes_vendored/pandas/core/series.py b/third_party/bigframes_vendored/pandas/core/series.py index 7b420cf6e3..932959a826 100644 --- a/third_party/bigframes_vendored/pandas/core/series.py +++ b/third_party/bigframes_vendored/pandas/core/series.py @@ -326,6 +326,7 @@ def reset_index( drop: bool = False, name=pd_ext.no_default, inplace: bool = False, + allow_duplicates: Optional[bool] = None, ) -> DataFrame | Series | None: """ Generate a new DataFrame or Series with the index reset. @@ -413,6 +414,8 @@ def reset_index( when `drop` is True. inplace (bool, default False): Modify the Series in place (do not create a new object). + allow_duplicates (bool, optional, default None): + Allow duplicate column labels to be created. Returns: bigframes.pandas.Series or bigframes.pandas.DataFrame or None: diff --git a/third_party/bigframes_vendored/version.py b/third_party/bigframes_vendored/version.py index 6b84e2eb1d..b9aa5d1855 100644 --- a/third_party/bigframes_vendored/version.py +++ b/third_party/bigframes_vendored/version.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "2.16.0" +__version__ = "2.17.0" # {x-release-please-start-date} -__release_date__ = "2025-08-20" +__release_date__ = "2025-08-22" # {x-release-please-end}