Slide 42
Slide 42 text
The Ugly Part
sc.textFile("s3://bucket/*").flatMap(_.split("[^\\p{L}]+"))
Spark
p.apply(TextIO.read().from("gs://bucket/*"))
.apply("ExtractWords", FlatMapElements.into(TypeDescriptors.strings())
.via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
Beam Java
(p | beam.io.ReadFromText('gs://bucket/*')
| 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x)))
Beam Python
lines := textio.Read(s, "gs://bucket/*")
words := beam.ParDo(s, func(line string, emit func(string)) {
for _, word := range wordRE.FindAllString(line, -1) { emit(word) }
}, lines)
Beam Go