经管之家送您一份
应届毕业生专属福利!
求职就业群
感谢您参与论坛问题回答
经管之家送您两个论坛币!
+2 论坛币
"""
A simple text classification pipeline that recognizes "spark" from
input text. This is to show how to create and configure a Spark ML
pipeline in Python. Run with:

  bin/spark-submit examples/src/main/python/ml/simple_text_classification_pipeline.py
"""
# NOTE: the pasted original read "from_future__", a corruption of the
# __future__ import; it must be the first statement after the docstring.
from __future__ import print_function

from pyspark import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.sql import Row, SQLContext


if __name__ == "__main__":
    sc = SparkContext(appName="SimpleTextClassificationPipeline")
    sqlContext = SQLContext(sc)

    # Prepare training documents, which are labeled.
    LabeledDocument = Row("id", "text", "label")
    training = sc.parallelize([(0, "a b c d e spark", 1.0),
                               (1, "b d", 0.0),
                               (2, "spark f g h", 1.0),
                               (3, "hadoop mapreduce", 0.0)]) \
        .map(lambda x: LabeledDocument(*x)).toDF()

    # Configure an ML pipeline, which consists of three stages:
    # tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.001)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to the training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled.
    Document = Row("id", "text")
    test = sc.parallelize([(4, "spark i j k"),
                           (5, "l m n"),
                           (6, "spark hadoop spark"),
                           (7, "apache hadoop")]) \
        .map(lambda x: Document(*x)).toDF()

    # Make predictions on the test documents and print columns of interest.
    prediction = model.transform(test)
    selected = prediction.select("id", "text", "prediction")
    for row in selected.collect():
        print(row)

    sc.stop()
- Status API Training Shop Blog About Pricing
- © 2015 GitHub, Inc. Terms Privacy Security Contact Help
复制代码
扫码加我 拉你入群
请注明:姓名-公司-职位
以便审核进群资格,未注明则拒绝
|