Skip to content

Commit 1f6205e

Browse files
fix(bigframes): include pyopenssl as a dependency (#17362)
Also, support pandas 3.0 in various system tests.

Internal issue b/519591816 🦕

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent f9ff3b1 commit 1f6205e

23 files changed

Lines changed: 183 additions & 89 deletions

File tree

packages/bigframes/bigframes/bigquery/_operations/ai.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def generate(
6161
>>> import bigframes.pandas as bpd
6262
>>> import bigframes.bigquery as bbq
6363
>>> country = bpd.Series(["Japan", "Canada"])
64-
>>> bbq.ai.generate(("What's the capital city of ", country, " one word only"))
64+
>>> bbq.ai.generate(("What's the capital city of ", country, " one word only")) # doctest: +ELLIPSIS
6565
0 {'result': 'Tokyo', 'full_response': '{"cand...
6666
1 {'result': 'Ottawa', 'full_response': '{"can...
6767
dtype: struct<result: string, full_response: extension<dbjson<JSONArrowType>>, status: string>[pyarrow]
@@ -231,8 +231,8 @@ def generate_int(
231231
232232
>>> import bigframes.pandas as bpd
233233
>>> import bigframes.bigquery as bbq
234-
>>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"])
235-
>>> bbq.ai.generate_int(("How many legs does a ", animal, " have?"))
234+
>>> animal = bpd.Series(["Ostrich", "Rabbit", "Spider"])
235+
>>> bbq.ai.generate_int(("How many legs does a ", animal, " have?")) # doctest: +ELLIPSIS
236236
0 {'result': 2, 'full_response': '{"candidates":...
237237
1 {'result': 4, 'full_response': '{"candidates":...
238238
2 {'result': 8, 'full_response': '{"candidates":...
@@ -305,8 +305,8 @@ def generate_double(
305305
306306
>>> import bigframes.pandas as bpd
307307
>>> import bigframes.bigquery as bbq
308-
>>> animal = bpd.Series(["Kangaroo", "Rabbit", "Spider"])
309-
>>> bbq.ai.generate_double(("How many legs does a ", animal, " have?"))
308+
>>> animal = bpd.Series(["Ostrich", "Rabbit", "Spider"])
309+
>>> bbq.ai.generate_double(("How many legs does a ", animal, " have?")) # doctest: +ELLIPSIS
310310
0 {'result': 2.0, 'full_response': '{"candidates...
311311
1 {'result': 4.0, 'full_response': '{"candidates...
312312
2 {'result': 8.0, 'full_response': '{"candidates...
@@ -383,7 +383,7 @@ def generate_embedding(
383383
>>> import bigframes.pandas as bpd
384384
>>> import bigframes.bigquery as bbq
385385
>>> df = bpd.DataFrame({"content": ["apple", "bear", "pear"]})
386-
>>> bbq.ai.generate_embedding(
386+
>>> bbq.ai.generate_embedding( # doctest: +SKIP
387387
... "project.dataset.model_name",
388388
... df
389389
... )
@@ -486,7 +486,7 @@ def generate_text(
486486
>>> import bigframes.pandas as bpd
487487
>>> import bigframes.bigquery as bbq
488488
>>> df = bpd.DataFrame({"prompt": ["write a poem about apples"]})
489-
>>> bbq.ai.generate_text(
489+
>>> bbq.ai.generate_text( # doctest: +SKIP
490490
... "project.dataset.model_name",
491491
... df
492492
... )
@@ -601,7 +601,7 @@ def generate_table(
601601
>>> # the necessary columns for the model's prompt. For example, a
602602
>>> # DataFrame with a 'prompt' column for text classification.
603603
>>> df = bpd.DataFrame({'prompt': ["some text to classify"]})
604-
>>> result = bbq.ai.generate_table(
604+
>>> result = bbq.ai.generate_table( # doctest: +SKIP
605605
... "project.dataset.model_name",
606606
... data=df,
607607
... output_schema="category STRING"
@@ -708,12 +708,14 @@ def embed(
708708
709709
>>> import bigframes.pandas as bpd
710710
>>> import bigframes.bigquery as bbq
711-
>>> bbq.ai.embed("dog", endpoint="text-embedding-005")
711+
>>> bbq.ai.embed("dog", endpoint="text-embedding-005") # doctest: +ELLIPSIS
712712
0 {'result': array([ 1.78243860e-03, -1.10658340...
713+
dtype: struct<result: list<item: double>, status: string>[pyarrow]
713714
714715
>>> s = bpd.Series(['dog'])
715-
>>> bbq.ai.embed(s, endpoint='text-embedding-005')
716+
>>> bbq.ai.embed(s, endpoint='text-embedding-005') # doctest: +ELLIPSIS
716717
0 {'result': array([ 1.78243860e-03, -1.10658340...
718+
dtype: struct<result: list<item: double>, status: string>[pyarrow]
717719
718720
Args:
719721
content (str | Series):
@@ -1004,6 +1006,7 @@ def similarity(
10041006
>>> bbq.ai.similarity(df['word'], 'glad', endpoint='text-embedding-005')
10051007
0 0.916601
10061008
1 0.660579
1009+
Name: word, dtype: Float64
10071010
10081011
Args:
10091012
content1 (str | Series):
@@ -1082,8 +1085,8 @@ def forecast(
10821085
>>> df = pd.DataFrame({"value": [1, 2, 3], "time": pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-03"])})
10831086
>>> bpd.options.display.progress_bar = None
10841087
>>> forecasted_pandas_df = df.bigquery.ai.forecast(data_col="value", timestamp_col="time", horizon=2)
1085-
>>> type(forecasted_pandas_df)
1086-
<class 'pandas.core.frame.DataFrame'>
1088+
>>> type(forecasted_pandas_df) # doctest: +ELLIPSIS
1089+
<class 'pandas...DataFrame'>
10871090
10881091
Forecast using a BigFrames DataFrame:
10891092

packages/bigframes/bigframes/bigquery/_operations/struct.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -57,5 +57,5 @@ def struct(value: dataframe.DataFrame) -> series.Series:
5757
block, result_id = block.apply_nary_op(
5858
block.value_columns, ops.StructOp(column_names=tuple(block.column_labels))
5959
)
60-
block = block.select_column(result_id)
60+
block = block.select_column(result_id).with_column_labels([None])
6161
return series.Series(block)

packages/bigframes/bigframes/core/blocks.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1991,6 +1991,10 @@ def _generate_resample_label(
19911991
)
19921992
level = level or 0
19931993
col_id = self.index.resolve_level(level)[0]
1994+
if isinstance(level, int):
1995+
resample_label = self.index.names[level]
1996+
else:
1997+
resample_label = level
19941998
# Reset index to make the resampling level a column, then drop all other index columns.
19951999
# This simplifies processing by focusing solely on the column required for resampling.
19962000
block = self.reset_index(drop=False)
@@ -2009,6 +2013,7 @@ def _generate_resample_label(
20092013
raise KeyError(f"The grouper name {on} is not found")
20102014

20112015
col_id = matches[0]
2016+
resample_label = on
20122017
block = self
20132018
if level is None:
20142019
dtype = self._column_type(col_id)
@@ -2101,6 +2106,7 @@ def _generate_resample_label(
21012106
block.value_columns[0],
21022107
block.value_columns[1],
21032108
op=ops.IntegerLabelToDatetimeOp(freq=freq, label=label, origin=origin),
2109+
result_label=resample_label,
21042110
)
21052111

21062112
# After multiple merges, the columns:

packages/bigframes/bigframes/core/compile/polars/compiler.py

Lines changed: 52 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -178,8 +178,59 @@ def _(
178178
self,
179179
expression: ex.OpExpression,
180180
) -> pl.Expr:
181-
# TODO: Complete the implementation
181+
import datetime
182+
183+
import pyarrow as pa
184+
182185
op = expression.op
186+
187+
# Polars panics on nulls from pandas objects in timezone-aware
188+
# datetimes for certain ops. Fill nulls with a dummy timestamp temporarily
189+
# to avoid this issue.
190+
# TODO(tswast): Remove workaround when
191+
# https://github.com/pola-rs/polars/issues/27862 has been fixed.
192+
is_problematic_op = type(op) in (
193+
date_ops.YearOp,
194+
date_ops.QuarterOp,
195+
date_ops.MonthOp,
196+
date_ops.DayOp,
197+
date_ops.IsoWeekOp,
198+
)
199+
200+
if is_problematic_op and len(expression.inputs) == 1:
201+
input_expr = expression.inputs[0]
202+
if (
203+
input_expr.is_resolved
204+
and isinstance(input_expr.output_type, pd.ArrowDtype)
205+
and isinstance(
206+
input_expr.output_type.pyarrow_dtype, pa.TimestampType
207+
)
208+
and input_expr.output_type.pyarrow_dtype.tz is not None
209+
):
210+
tz_str = input_expr.output_type.pyarrow_dtype.tz
211+
if tz_str == "UTC":
212+
dummy_tz = datetime.timezone.utc
213+
else:
214+
try:
215+
from zoneinfo import ZoneInfo
216+
217+
dummy_tz = ZoneInfo(tz_str) # type: ignore
218+
except Exception:
219+
dummy_tz = datetime.timezone.utc
220+
221+
dummy_val = datetime.datetime(1970, 1, 1, tzinfo=dummy_tz)
222+
223+
compiled_input = self.compile_expression(input_expr)
224+
filled_input = compiled_input.fill_null(dummy_val)
225+
compiled_op_with_fill = self.compile_op(op, filled_input)
226+
227+
return (
228+
pl.when(compiled_input.is_null())
229+
.then(None)
230+
.otherwise(compiled_op_with_fill)
231+
)
232+
233+
# TODO: Complete the implementation
183234
args = tuple(map(self.compile_expression, expression.inputs))
184235
return self.compile_op(op, *args)
185236

packages/bigframes/bigframes/core/indexes/base.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -325,6 +325,7 @@ def get_loc(self, key) -> typing.Union[int, slice, "bigframes.series.Series"]:
325325
# Return boolean mask for non-monotonic duplicates
326326
mask_block = block_with_offsets.select_columns([match_col_id])
327327
mask_block = mask_block.reset_index(drop=True)
328+
mask_block = mask_block.with_column_labels([None])
328329
result_series = bigframes.series.Series(mask_block)
329330
return result_series.astype("boolean")
330331

packages/bigframes/bigframes/operations/ai.py

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,10 @@ def map(
122122
>>> model = llm.GeminiTextGenerator(model_name="gemini-2.5-pro")
123123
124124
>>> df = bpd.DataFrame({"ingredient_1": ["Burger Bun", "Soy Bean"], "ingredient_2": ["Beef Patty", "Bittern"]})
125-
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"})
126-
ingredient_1 ingredient_2 food
127-
0 Burger Bun Beef Patty Burger
128-
<BLANKLINE>
129-
1 Soy Bean Bittern Tofu
130-
<BLANKLINE>
125+
>>> df.ai.map("What is the food made from {ingredient_1} and {ingredient_2}? One word only.", model=model, output_schema={"food": "string"}) # doctest: +ELLIPSIS
126+
ingredient_1 ingredient_2...
127+
0 Burger Bun Beef Patty...
128+
1 Soy Bean Bittern...Tofu
131129
<BLANKLINE>
132130
[2 rows x 3 columns]
133131

packages/bigframes/bigframes/series.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2472,7 +2472,9 @@ def map(
24722472

24732473
self_df = self.to_frame(name="series")
24742474
result_df = self_df.join(map_df, on="series")
2475-
return result_df[self.name]
2475+
result = cast(Series, result_df[self.name])
2476+
result.name = self.name
2477+
return result
24762478

24772479
@validations.requires_ordering()
24782480
def sample(
@@ -2698,7 +2700,7 @@ def _apply_nary_op(
26982700
others, ignore_self=ignore_self, cast_scalars=False
26992701
)
27002702
block, result_id = block.project_expr(op.as_expr(*values))
2701-
return Series(block.select_column(result_id))
2703+
return Series(block.select_column(result_id).with_column_labels([None]))
27022704

27032705
def _apply_binary_aggregation(
27042706
self, other: Series, stat: agg_ops.BinaryAggregateOp

packages/bigframes/bigframes/testing/utils.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -93,6 +93,14 @@ def assert_series_equivalent(pd_series: pd.Series, bf_series: bpd.Series, **kwar
9393
def _normalize_all_nulls(col: pd.Series) -> pd.Series:
9494
if pd_types.is_float_dtype(col.dtype):
9595
col = col.astype("float64").astype("Float64")
96+
elif col.dtype == "object":
97+
if any(isinstance(x, decimal.Decimal) for x in col):
98+
pass
99+
else:
100+
try:
101+
col = col.astype("Float64")
102+
except (TypeError, ValueError, SystemError):
103+
pass
96104
return col
97105

98106

packages/bigframes/setup.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@
3838
"fsspec >=2023.3.0",
3939
"gcsfs >=2023.3.0, !=2025.5.0, !=2026.2.0, !=2026.3.0",
4040
"geopandas >=0.12.2",
41-
"google-auth >=2.15.0,<3.0",
41+
"google-auth[pyopenssl] >=2.15.0,<3.0",
4242
"google-cloud-bigquery[bqstorage,pandas] >=3.36.0",
4343
# 2.30 needed for arrow support.
4444
"google-cloud-bigquery-storage >= 2.30.0, < 3.0.0",
@@ -75,6 +75,7 @@
7575
"pytest-snapshot",
7676
"google-cloud-bigtable >=2.24.0",
7777
"google-cloud-pubsub >=2.21.4",
78+
"tzdata",
7879
],
7980
# used for local engine
8081
"polars": ["polars >= 1.21.0"],

packages/bigframes/tests/system/small/functions/test_remote_function.py

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -468,7 +468,12 @@ def add_one(x):
468468

469469
pd_int64_df = scalars_pandas_df[int64_cols]
470470
pd_int64_df_filtered = pd_int64_df.dropna()
471-
pd_result = pd_int64_df_filtered.applymap(add_one)
471+
472+
# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
473+
if hasattr(pd_int64_df_filtered, "map"):
474+
pd_result = pd_int64_df_filtered.map(add_one)
475+
else:
476+
pd_result = pd_int64_df_filtered.applymap(add_one)
472477
# TODO(shobs): Figure out why pandas .applymap() changes the dtype, i.e.
473478
# pd_int64_df_filtered.dtype is Int64Dtype()
474479
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.
@@ -503,7 +508,13 @@ def add_one(x):
503508

504509
pd_int64_df = scalars_pandas_df[int64_cols]
505510
pd_int64_df_filtered = pd_int64_df[pd_int64_df["int64_col"].notnull()]
506-
pd_result = pd_int64_df_filtered.applymap(add_one)
511+
512+
# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
513+
if hasattr(pd_int64_df_filtered, "map"):
514+
pd_result = pd_int64_df_filtered.map(add_one)
515+
else:
516+
pd_result = pd_int64_df_filtered.applymap(add_one)
517+
507518
# TODO(shobs): Figure out why pandas .applymap() changes the dtype, i.e.
508519
# pd_int64_df_filtered.dtype is Int64Dtype()
509520
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.
@@ -536,7 +547,13 @@ def add_one(x):
536547
bf_result = bf_int64_df.applymap(remote_add_one, na_action="ignore").to_pandas()
537548

538549
pd_int64_df = scalars_pandas_df[int64_cols]
539-
pd_result = pd_int64_df.applymap(add_one, na_action="ignore")
550+
551+
# TODO(swast): Remove when pandas 2.1.x+ is the minimum supported.
552+
if hasattr(pd_int64_df, "map"):
553+
pd_result = pd_int64_df.map(add_one, na_action="ignore")
554+
else:
555+
pd_result = pd_int64_df.applymap(add_one, na_action="ignore")
556+
540557
# TODO(shobs): Figure out why pandas .applymap() changes the dtype, i.e.
541558
# pd_int64_df_filtered.dtype is Int64Dtype()
542559
# pd_int64_df_filtered.applymap(lambda x: x).dtype is int64.

0 commit comments

Comments
 (0)