@@ -20,6 +20,7 @@ def spark_streaming_to_pubsublite(
) -> None:
# [START pubsublite_spark_streaming_to_pubsublite]
from pyspark.sql import SparkSession
+   from pyspark.sql.functions import array, create_map, col, lit, when
from pyspark.sql.types import BinaryType, StringType
import uuid

@@ -35,13 +36,32 @@ def spark_streaming_to_pubsublite(
# |-- value: long (nullable = true)
sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

+   # Transform the dataframe to match the required data fields and data types:
+   # https://github.com/googleapis/java-pubsublite-spark#data-schema
    sdf = (
-       sdf.withColumn("key", (sdf.value % 5).cast(StringType()).cast(BinaryType()))
-       .withColumn("event_timestamp", sdf.timestamp)
-       .withColumn("data", sdf.value.cast(StringType()).cast(BinaryType()))
-       .drop("value", "timestamp")
+       sdf.withColumn("key", lit("example").cast(BinaryType()))
+       .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
+       .withColumnRenamed("timestamp", "event_timestamp")
+       # Populate the attributes field. For example, an even value will
+       # have {"key1": [b"even"]}.
+       .withColumn(
+           "attributes",
+           create_map(
+               lit("key1"),
+               array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
+           ),
+       )
+       .drop("value")
    )
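
Since the transform above runs on a streaming source, a quick way to eyeball the even/odd attributes logic is to apply the same expressions to a small batch dataframe. This is a sketch only, not part of the PR; the session and variable names here are illustrative.

    # Illustration only, not part of the diff.
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array, col, create_map, lit, when

    spark = SparkSession.builder.master("local[1]").appName("attributes-check").getOrCreate()

    # Reuse the "value" column name so the expressions match the streaming transform.
    batch = spark.range(4).withColumnRenamed("id", "value")
    check = batch.withColumn(
        "attributes",
        create_map(
            lit("key1"),
            array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
        ),
    )
    # Rows with an even value carry the bytes of "even"; odd rows carry the bytes of "odd".
    check.show(truncate=False)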

+   # After the transformation, the schema of the dataframe should look like:
+   # |-- key: binary (nullable = false)
+   # |-- data: binary (nullable = true)
+   # |-- event_timestamp: timestamp (nullable = true)
+   # |-- attributes: map (nullable = false)
Review thread on the "attributes: map (nullable = false)" line:

"our guide and everywhere else actually specifies that attributes is nullable."

@jiangmichaellll (Jan 28, 2022):
"but our code doesn't really check, since they are compatible, i.e. our codebase is OK with nullable. I think it's fine."

@anguillanneuf (Member Author, Jan 28, 2022):
"@jiangmichaellll Yes, all the fields can be null. This is just what gets printed for the dataframe created by the code above."
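
A minimal check, not part of this PR, illustrates the point discussed in the thread; it assumes the sample's sdf and the pyspark.sql.functions imports added above. It shows why Spark reports nullable = false here and one way to relax it if a nullable column is preferred:

    # Illustration only, not part of the diff.
    # create_map() and non-null literals can never produce a NULL value, so Spark
    # derives nullable = false for the "key" and "attributes" columns built above.
    print(sdf.schema["attributes"].nullable)  # prints: False
    # If a nullable attributes column is preferred, wrapping the expression in
    # when() without an otherwise() branch makes Spark mark it nullable:
    # sdf = sdf.withColumn("attributes", when(lit(True), col("attributes")))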

+   # |    |-- key: string
+   # |    |-- value: array (valueContainsNull = false)
+   # |    |    |-- element: binary (containsNull = false)
sdf.printSchema()

query = (
... (remaining diff lines collapsed)
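
The write query itself is collapsed in the diff above. As a rough sketch of how the transformed dataframe is typically written with the Pub/Sub Lite Spark connector (option names taken from the googleapis/java-pubsublite-spark README; the topic path is a placeholder, and this is not necessarily the PR's exact code), assuming the sample's sdf and its uuid import:

    # Sketch only -- assumed from the connector README, not the collapsed PR lines.
    topic_path = "projects/PROJECT_NUMBER/locations/LOCATION/topics/TOPIC_ID"  # placeholder

    query = (
        sdf.writeStream.format("pubsublite")
        .option("pubsublite.topic", topic_path)
        # A checkpoint location is required for any Structured Streaming sink.
        .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
        .outputMode("append")
        .start()
    )
    # Wait up to a minute for data to flow, then stop the query.
    query.awaitTermination(60)
    query.stop()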