Slide 97
import logging
import threading

import apache_beam as beam
from apache_beam.io.gcp import gcsio
from apache_beam.options import pipeline_options

import mixer
import track_storage
class MixerDoFn(beam.DoFn):
    PROJECT = "sigint"
    GCS_BUCKET = "sigint-output"
    GCS_OBJECT_PATH = "automixer-beam"
    OUTPUT_NAME_TPL = "{track1_id}-{track2_id}-mix.ogg"
    GCS_OUTPUT_TPL = "gs://{bucket}/{object_path}/{filename}"

    _thread_local = threading.local()
    @property
    def gcs_client(self):
        # Lazily create one GCS client per worker thread instead of
        # creating a new client for every element.
        client = getattr(self._thread_local, "gcs_client", None)
        if not client:
            self._thread_local.gcs_client = gcsio.GcsIO()
        return self._thread_local.gcs_client
    def process(self, entity_ids):
        # Pub/Sub payload is b"<track1_id>,<track2_id>"
        track1_id, track2_id = entity_ids.decode("utf-8").split(",")
        output_filename = MixerDoFn.OUTPUT_NAME_TPL.format(
            track1_id=track1_id, track2_id=track2_id
        )
        gcs_output_path = MixerDoFn.GCS_OUTPUT_TPL.format(
            bucket=MixerDoFn.GCS_BUCKET,
            object_path=MixerDoFn.GCS_OBJECT_PATH,
            filename=output_filename,
        )
        # Check if output already exists:
        if self.gcs_client.exists(gcs_output_path):
            # Don't do unnecessary work
            logging.info(
                "Mix for {} & {} already exists: {}".format(
                    track1_id, track2_id, gcs_output_path
                )
            )
            return
        # Check if input data is available
        err_msg = "Input for {track} is not available: {e}"
        try:
            track1_input_path = track_storage.download_track(track1_id)
        except Exception as e:
            logging.error(err_msg.format(track=track1_id, e=e))
            return
        try:
            track2_input_path = track_storage.download_track(track2_id)
        except Exception as e:
            logging.error(err_msg.format(track=track2_id, e=e))
            return
        # Wrap the downloaded files in Track objects
        track1 = mixer.Track(track1_id, track1_input_path)
        track2 = mixer.Track(track2_id, track2_input_path)

        # Mix tracks & save to output file
        mixer.mix(track1, track2, output_filename)
        # Upload mix to GCS
        logging.info("Uploading mix to {}".format(gcs_output_path))
        with self.gcs_client.open(
            gcs_output_path, "wb", mime_type="application/octet-stream"
        ) as dest:
            with open(output_filename, "rb") as source:
                dest.write(source.read())

        yield entity_ids
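
# NOTE: `mixer` and `track_storage` are project-local helpers that are not
# shown on this slide. The code above only assumes roughly this shape
# (hypothetical signatures, not the actual modules):
#   track_storage.download_track(track_id) -> local file path, raises on miss
#   mixer.Track(track_id, local_path)      -> lightweight track handle
#   mixer.mix(track1, track2, out_path)    -> writes the mixed audio to out_path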
def run():
    input_subscription = "projects/sigint/subscriptions/automixer-klio-input-automixer-klio"
    output_topic = "projects/sigint/topics/automixer-klio-output"

    options = pipeline_options.PipelineOptions()

    gcp_opts = options.view_as(pipeline_options.GoogleCloudOptions)
    gcp_opts.job_name = "automixer-beam"
    gcp_opts.project = "sigint"
    gcp_opts.region = "europe-west1"
    gcp_opts.temp_location = "gs://sigint-dataflow-tmp/automixer-beam/temp"
    gcp_opts.staging_location = "gs://sigint-dataflow-tmp/automixer-beam/staging"
    worker_opts = options.view_as(pipeline_options.WorkerOptions)
    worker_opts.subnetwork = "https://www.googleapis.com/compute/v1/projects/some-network/regions/europe-west1/subnetworks/foo1"
    worker_opts.machine_type = "n1-standard-2"
    worker_opts.disk_size_gb = 32
    worker_opts.num_workers = 2
    worker_opts.max_num_workers = 2
    worker_opts.worker_harness_container_image = "gcr.io/sigint/automixer-worker-beam:1"
    standard_opts = options.view_as(pipeline_options.StandardOptions)
    standard_opts.streaming = True
    standard_opts.runner = "DataflowRunner"

    debug_opts = options.view_as(pipeline_options.DebugOptions)
    debug_opts.experiments = ["beam_fn_api"]

    # Pickle the main session so module-level imports are available on workers
    options.view_as(pipeline_options.SetupOptions).save_main_session = True
    logging.info("Launching pipeline...")
    pipeline = beam.Pipeline(options=options)
    (
        pipeline
        | beam.io.ReadFromPubSub(subscription=input_subscription)
        | beam.ParDo(MixerDoFn())
        | beam.io.WriteToPubSub(output_topic)
    )
    result = pipeline.run()
    result.wait_until_finish()
if __name__ == "__main__":
    fmt = "%(asctime)s %(message)s"
    logging.basicConfig(format=fmt, level=logging.INFO)
    run()
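
To exercise the pipeline end to end, you can publish a comma-separated pair of track IDs to the input topic. A minimal sketch using google-cloud-pubsub, assuming the subscription above is attached to a topic named automixer-klio-input (the topic name itself is not shown on this slide):

from google.cloud import pubsub_v1

publisher = pubsub_v1.PublisherClient()
# Assumed topic name; only the subscription appears on the slide.
topic_path = publisher.topic_path("sigint", "automixer-klio-input")
# MixerDoFn.process expects the payload b"<track1_id>,<track2_id>".
future = publisher.publish(topic_path, b"track-abc,track-xyz")
print("Published:", future.result())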