Skip to content

Commit 6b95400

Browse files
committed
repro
1 parent 87eb8a0 commit 6b95400

File tree

4 files changed

+29
-76
lines changed

4 files changed

+29
-76
lines changed

dataflow/flex-templates/streaming_beam/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ WORKDIR ${WORKDIR}
2121
COPY requirements.txt .
2222
COPY streaming_beam.py .
2323

24-
ENV FLEX_TEMPLATE_PYTHON_REQUIREMENTS_FILE="${WORKDIR}/requirements.txt"
2524
ENV FLEX_TEMPLATE_PYTHON_PY_FILE="${WORKDIR}/streaming_beam.py"
2625

27-
RUN pip install -U -r ./requirements.txt
26+
RUN pip install matplotlib
27+
RUN pip install apache-beam[gcp]==2.21.0
28+
RUN pip install pip==9.0.3
Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,15 @@
11
{
2-
"name": "Streaming beam Python flex template",
3-
"description": "Streaming beam example for python flex template.",
2+
"name": "Batch beam Python flex template",
3+
"description": "Batch beam example for python flex template.",
44
"parameters": [
55
{
6-
"name": "input_subscription",
7-
"label": "Input PubSub subscription.",
8-
"helpText": "Name of the input PubSub subscription to consume from.",
9-
"regexes": [
10-
"[/:-_.a-zA-Z0-9]+"
11-
]
12-
},
13-
{
14-
"name": "output_table",
15-
"label": "BigQuery output table name.",
16-
"helpText": "Name of the BigQuery output table name.",
6+
"name": "output_text",
7+
"label": "Output text location",
8+
"helpText": "Path to output text location",
179
"is_optional": true,
1810
"regexes": [
19-
"[/:-_.a-zA-Z0-9]+"
11+
".*"
2012
]
2113
}
2214
]
23-
}
15+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
apache-beam[gcp]==2.21.0
2+
matplotlib

dataflow/flex-templates/streaming_beam/streaming_beam.py

Lines changed: 18 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -25,81 +25,40 @@
2525
import logging
2626
import time
2727

28+
# Unused dependency, imported deliberately to reproduce the reported bug.
29+
import matplotlib
30+
import numpy as np
31+
2832
import apache_beam as beam
2933
from apache_beam.options.pipeline_options import PipelineOptions
3034
import apache_beam.transforms.window as window
3135

32-
# Defines the BigQuery schema for the output table.
33-
SCHEMA = ','.join([
34-
'url:STRING',
35-
'num_reviews:INTEGER',
36-
'score:FLOAT64',
37-
'first_date:TIMESTAMP',
38-
'last_date:TIMESTAMP',
39-
])
40-
41-
42-
def parse_json_message(message):
43-
"""Parse the input json message and add 'score' & 'processing_time' keys."""
44-
row = json.loads(message)
45-
return {
46-
'url': row['url'],
47-
'score': 1.0 if row['review'] == 'positive' else 0.0,
48-
'processing_time': int(time.time()),
49-
}
50-
5136

52-
def get_statistics(url_messages):
53-
"""Get statistics from the input URL messages."""
54-
url, messages = url_messages
55-
return {
56-
'url': url,
57-
'num_reviews': len(messages),
58-
'score': sum(msg['score'] for msg in messages) / len(messages),
59-
'first_date': min(msg['processing_time'] for msg in messages),
60-
'last_date': max(msg['processing_time'] for msg in messages),
61-
}
37+
def useless_numpy_function(x):
    """Round-trip *x* through a NumPy array and return its string rendering.

    Exists only to force a real NumPy dependency into the pipeline worker
    (used to reproduce a dependency-staging bug); it adds no data value.
    """
    as_array = np.array(x)
    return str(as_array)
6239

6340

def run(args, output_text):
    """Build and run the pipeline.

    Args:
        args: Extra command-line arguments forwarded to ``PipelineOptions``.
        output_text: Destination path prefix handed to ``WriteToText``
            (presumably a GCS bucket path — TODO confirm against template docs).
    """
    # save_main_session=True pickles this module's globals so the Map
    # function (and its numpy import) is available on remote workers.
    options = PipelineOptions(args, save_main_session=True)

    with beam.Pipeline(options=options) as pipeline:
        # Three fixed elements are enough to exercise the worker environment.
        source = pipeline | "Create tiny collection" >> beam.Create(["a", "b", "c"])
        mapped = source | "Useless Numpy Function" >> beam.Map(useless_numpy_function)
        _ = mapped | "Write output" >> beam.io.Write(beam.io.WriteToText(output_text))

87-
88-
if __name__ == "__main__":
    # Script entry point: parse the template's single parameter and run.
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output_text", help="Path to output location (should be in a bucket)"
    )

    known_args, pipeline_args = parser.parse_known_args()
    run(pipeline_args, known_args.output_text)

0 commit comments

Comments
 (0)