Original poster: hooli

Learning Spark by Holden Karau (O'Reilly)

#31 ReneeBK, posted 2016-9-26 00:12:00
/**
 * Illustrates a simple map partition to parse CSV data in Scala
 */
package com.oreilly.learningsparkexamples.scala

import java.io.StringReader
import java.io.StringWriter

import org.apache.spark._
import scala.collection.JavaConversions._

import au.com.bytecode.opencsv.CSVReader
import au.com.bytecode.opencsv.CSVWriter

object BasicParseCsv {
  case class Person(name: String, favouriteAnimal: String)

  def main(args: Array[String]) {
    if (args.length < 3) {
      println("Usage: [sparkmaster] [inputfile] [outputfile]")
      sys.exit(1)
    }
    val master = args(0)
    val inputFile = args(1)
    val outputFile = args(2)
    val sc = new SparkContext(master, "BasicParseCsv", System.getenv("SPARK_HOME"))
    val input = sc.textFile(inputFile)
    // Parse each line as a single CSV record (breaks on records with embedded newlines)
    val result = input.map { line =>
      val reader = new CSVReader(new StringReader(line))
      reader.readNext()
    }
    val people = result.map(x => Person(x(0), x(1)))
    val pandaLovers = people.filter(person => person.favouriteAnimal == "panda")
    // Write the survivors back out as CSV, one writer per partition
    pandaLovers.map(person => List(person.name, person.favouriteAnimal).toArray).mapPartitions { people =>
      val stringWriter = new StringWriter()
      val csvWriter = new CSVWriter(stringWriter)
      csvWriter.writeAll(people.toList)
      Iterator(stringWriter.toString)
    }.saveAsTextFile(outputFile)
  }
}
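A quick sketch of what this job consumes and produces, using hypothetical file contents (each input line is one name,favouriteAnimal record; none of this data is from the book):

// favourite_animals.csv (hypothetical input)
//   holden,panda
//   sparky,otter
//
// After the "panda" filter, the output part files hold one CSV line;
// CSVWriter quotes fields by default:
//   "holden","panda"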


#32 ReneeBK, posted 2016-9-26 00:12:39
/**
 * Illustrates a simple map partition to parse JSON data in Scala.
 * Loads the data into a case class with the name and a boolean flag
 * for whether the person loves pandas.
 */
package com.oreilly.learningsparkexamples.scala

import org.apache.spark._
import play.api.libs.json._
import play.api.libs.functional.syntax._

object BasicParseJson {
  case class Person(name: String, lovesPandas: Boolean)
  implicit val personReads = Json.format[Person]

  def main(args: Array[String]) {
    if (args.length < 3) {
      println("Usage: [sparkmaster] [inputfile] [outputfile]")
      sys.exit(1)
    }
    val master = args(0)
    val inputFile = args(1)
    val outputFile = args(2)
    val sc = new SparkContext(master, "BasicParseJson", System.getenv("SPARK_HOME"))
    val input = sc.textFile(inputFile)
    val parsed = input.map(Json.parse(_))
    // We use asOpt combined with flatMap so that if a record fails to parse we
    // get back a None and the flatMap simply skips it.
    val result = parsed.flatMap(record => personReads.reads(record).asOpt)
    result.filter(_.lovesPandas).map(Json.toJson(_)).saveAsTextFile(outputFile)
  }
}
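A minimal sketch of how the asOpt-plus-flatMap error handling above behaves, assuming Play JSON on the classpath and the personReads format from the example (the input records are hypothetical):

val good = Json.parse("""{"name": "Sparky", "lovesPandas": true}""")
personReads.reads(good).asOpt  // Some(Person("Sparky", true)), kept by flatMap

val bad = Json.parse("""{"name": "Holden"}""")  // missing lovesPandas
personReads.reads(bad).asOpt   // None, so flatMap silently drops the record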


#33 ReneeBK, posted 2016-9-26 00:17:00
/**
 * Illustrates a simple map partition to parse JSON data in Scala.
 * Loads the data into a case class with the name and a boolean flag
 * for whether the person loves pandas.
 */
package com.oreilly.learningsparkexamples.scala

import org.apache.spark._
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.DeserializationFeature

case class Person(name: String, lovesPandas: Boolean) // Note: must be a top-level class

object BasicParseJsonWithJackson {

  def main(args: Array[String]) {
    if (args.length < 3) {
      println("Usage: [sparkmaster] [inputfile] [outputfile]")
      sys.exit(1)
    }
    val master = args(0)
    val inputFile = args(1)
    val outputFile = args(2)
    val sc = new SparkContext(master, "BasicParseJsonWithJackson", System.getenv("SPARK_HOME"))
    val input = sc.textFile(inputFile)

    // Parse it into a specific case class. We use mapPartitions because:
    // (a) ObjectMapper is not serializable, so we would either have to create a
    //     singleton ObjectMapper on the driver and ship every record back to the
    //     driver to go through it, or create a fresh ObjectMapper per record,
    //     which is expensive in a plain map.
    // (b) Creating one ObjectMapper per partition with mapPartitions solves both
    //     the serialization problem and the object-creation performance hit.
    val result = input.mapPartitions(records => {
      // mapper object created on each executor node
      val mapper = new ObjectMapper with ScalaObjectMapper
      mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
      mapper.registerModule(DefaultScalaModule)
      // We use flatMap to handle errors: an empty option (None) if we encounter
      // an issue, and a one-element option (Some(_)) if everything is ok.
      records.flatMap(record => {
        try {
          Some(mapper.readValue(record, classOf[Person]))
        } catch {
          case e: Exception => None
        }
      })
    }, true)
    result.filter(_.lovesPandas).mapPartitions(records => {
      val mapper = new ObjectMapper with ScalaObjectMapper
      mapper.registerModule(DefaultScalaModule)
      records.map(mapper.writeValueAsString(_))
    }).saveAsTextFile(outputFile)
  }
}
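The one-mapper-per-partition trick here generalizes to any expensive or non-serializable helper. A minimal sketch of the shape, where buildExpensiveHelper and process are hypothetical stand-ins:

// Build the helper once per partition on the executor, then amortize it
// over every record in that partition's iterator.
val processed = rdd.mapPartitions { records =>
  val helper = buildExpensiveHelper() // hypothetical factory, runs once per partition
  records.map(helper.process)         // the single helper handles the whole iterator
}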


#34 ReneeBK, posted 2016-9-26 00:17:35
/**
 * Illustrates parsing whole files of CSV data in Scala
 */
package com.oreilly.learningsparkexamples.scala

import java.io.StringReader

import org.apache.spark._
import scala.collection.JavaConversions._
import au.com.bytecode.opencsv.CSVReader

object BasicParseWholeFileCsv {
  def main(args: Array[String]) {
    if (args.length < 2) {
      println("Usage: [sparkmaster] [inputfile]")
      sys.exit(1)
    }
    val master = args(0)
    val inputFile = args(1)
    val sc = new SparkContext(master, "BasicParseWholeFileCsv", System.getenv("SPARK_HOME"))
    // wholeTextFiles yields (filename, contents) pairs, so quoted fields
    // that span multiple lines are parsed correctly
    val input = sc.wholeTextFiles(inputFile)
    val result = input.flatMap { case (_, txt) =>
      val reader = new CSVReader(new StringReader(txt))
      reader.readAll()
    }
    println(result.collect().map(_.toList).mkString(","))
  }
}
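To see why whole-file parsing matters, a minimal local sketch with no Spark involved: a quoted CSV field can contain a newline, which the line-at-a-time approach in #31 would split mid-record (the sample string is hypothetical):

import java.io.StringReader
import au.com.bytecode.opencsv.CSVReader

val txt = "\"holden\",\"likes pandas\nand coffee\"\n"
val rows = new CSVReader(new StringReader(txt)).readAll()
// rows holds a single two-field record; the second field spans two
// physical lines, so sc.textFile plus readNext() would have broken it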


#35 iid_garch, posted 2016-11-30 16:22:04


#36 十七里香, posted 2019-1-14 09:48:40
Thanks to the OP for sharing~~~~


#37 胡明敏, posted 2019-3-27 13:12:33
Thanks for sharing


#38 yuezzyy, posted 2019-7-5 11:54:12
Just taking a look


#39 Whatsappp, posted 2019-9-23 23:51:19 (from mobile)
Thanks for sharing

