变更
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
from pyspark import SparkConf, SparkContext
|
||||
import os
|
||||
|
||||
# Tell PySpark which Python interpreter to use. A raw string is used so the
# Windows backslashes are taken literally instead of being parsed as
# (invalid) escape sequences such as "\p" and "\c".
os.environ["PYSPARK_PYTHON"] = r"D:\programtool\conda\python"
|
||||
|
||||
# Create the SparkConf object: master "local[*]" (local mode; per the Spark
# docs, one worker thread per logical core) and app name "text_spark_app".
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
# Create the SparkContext from the SparkConf object.
sc = SparkContext(conf=conf)
# Print the Spark version (debug aid, disabled).
# print(sc.version)
|
||||
|
||||
# Load Python objects into Spark as RDDs via parallelize(): a list, a tuple
# and a string.
rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize((1, 2, 4, 5, 6))
rdd3 = sc.parallelize("342sdf")

# Debug aids (disabled): collect each RDD back to the driver and print it.
# print(rdd1.collect())
# print(rdd2.collect())
# print(rdd3.collect())

# Disabled example: build an RDD from a text file instead.
# rdd4 = sc.textFile("C语言程序设计(上).北京理工大学.csv")
# print(rdd4.collect())
|
||||
|
||||
def func(data):
    """Return *data* multiplied by 10.

    Used as the element-wise transformation passed to ``rdd1.map``.
    """
    return data * 10
|
||||
|
||||
|
||||
# Use map() to apply func to every element of rdd1 (multiplies each by 10).
rdd5 = rdd1.map(func)
# print(rdd5.collect())
|
||||
|
||||
# Use flatMap() to remove one level of nesting. rdd7 (map) is built from the
# same split for comparison: map keeps one list of words per input line,
# while flatMap flattens those lists into a single sequence of words.
rdd6 = sc.parallelize(["hello world 232", 'welcome learn python 233'])
rdd7 = rdd6.map(lambda x: x.split(" "))
rdd8 = rdd6.flatMap(lambda x: x.split(" "))
# print(rdd7.collect())
# print(rdd8.collect())
|
||||
|
||||
# reduceByKey(): values that share a key are combined pairwise with the given
# function (here, summed), producing one (key, total) pair per key.
rdd9 = sc.parallelize([('男', 88), ("女", 68), ('男', 48), ('女', 38)])
rdd10 = rdd9.reduceByKey(lambda a, b: a + b)
print(rdd10.collect())
|
||||
|
||||
# Stop the SparkContext.
sc.stop()
|
||||
Reference in New Issue
Block a user