准备文件及脚本层级
build用于存放scala编译后的类,src用于存放scala源码
compilescala.sh是编译脚本,run_wordcount.sh是通过spark-submit向集群提交任务的脚本
$ ls
build compilescala.sh run_wordcount.sh src
scala源码
task1:词频统计
$ cat ./src/wordcount.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
/**
 * Word-frequency job: reads a text file, counts how often each
 * space-separated token occurs and writes the `(word,count)` pairs as text.
 *
 * Usage: wordcount <inputpath> <outputpath>
 */
object wordcount {
  // Lazy so that merely loading this object does not start a SparkContext;
  // the context is created the first time main actually needs it.
  lazy val conf: SparkConf = new SparkConf().setAppName("WordCount")
  lazy val sc: SparkContext = new SparkContext(conf)

  /**
   * Entry point.
   *
   * @param args args(0) is the input path, args(1) is the output path
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2, "usage: wordcount <inputpath> <outputpath>")
    val inputpath = args(0)
    val outputpath = args(1)

    // Resolve the context before the try block so a failed start-up is not
    // retried (and masked) by the stop() call in finally.
    val context = sc
    try {
      val wordCounts = context.textFile(inputpath)
        .flatMap(line => line.split(" "))
        .map(word => (word, 1))
        .reduceByKey(_ + _) // word frequency

      // Repartition the distributed result directly. Collecting it to the
      // driver and re-parallelizing it would move every record through the
      // driver's memory for no benefit.
      wordCounts.repartition(2).saveAsTextFile(outputpath) // writes 2 part files
    } finally {
      // Release the context even when the job fails.
      context.stop()
    }
  }
}
$ cat compilescala.sh
#!/bin/bash
# Compile src/wordcount.scala against the Spark assembly jar and package the
# resulting classes into build/wordcount.jar.
set -e  # stop at the first failing step so a failed compile does not produce a jar

# scalac -d requires an existing directory; recreate it empty on every build.
rm -rf build/classes
mkdir -p build/classes
scalac -d build/classes -cp /usr/bin/hadoop/software/spark/lib/spark-assembly-1.6.0-U22-hadoop2.7.2U6.jar src/wordcount.scala
jar -cvf build/wordcount.jar -C build/classes .
# Local test run. spark-submit options must come before the jar; everything
# after the jar is passed to the application as arguments.
#spark-submit --master local[4] --class wordcount build/wordcount.jar <inputpath> <outputpath>
$ ./compilescala.sh
已添加清单
正在添加: wordcount$$anonfun$2.class(输入 = 1325) (输出 = 644)(压缩了 51%)
正在添加: wordcount$$anonfun$3.class(输入 = 1215) (输出 = 609)(压缩了 49%)
正在添加: wordcount$$anonfun$1.class(输入 = 1054) (输出 = 599)(压缩了 43%)
正在添加: wordcount$.class(输入 = 3186) (输出 = 1422)(压缩了 55%)
正在添加: wordcount.class(输入 = 1029) (输出 = 744)(压缩了 27%)
数据文件上传到hdfs集群inputpath目录下
$ cat run_wordcount.sh
#!/bin/bash
# Submit the wordcount job built by compilescala.sh to the cluster.
inputpath="/home/users/Spark/test/README.md"
outputpath="/home/users/Spark/test/wordcount/"
# Remove the output of a previous run. `-rmr` is deprecated in favour of
# `-rm -r`; `-f` keeps a missing directory (first run) from being an error.
hadoop fs -rm -r -f "${outputpath}"
spark-submit \
--conf spark.storage.memoryFraction=0.6 \
--conf spark.kryoserializer.buffer.max=512m \
--conf spark.kryoserializer.buffer=64m \
--class "wordcount" \
--driver-memory 6g \
--num-executors 10 \
--executor-cores 2 \
--executor-memory 6g \
--master yarn-client \
--priority HIGH \
build/wordcount.jar \
"${inputpath}" \
"${outputpath}"
$./run_wordcount.sh

task2:统计包含“Spark”的行数及这些行的单词数
$ cat ./src/wordcount.scala
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import java.lang.Math
/**
 * Line-statistics job: keeps the lines that contain "Spark", prints the
 * largest per-line word count and the number of such lines, and writes the
 * per-line word counts as text.
 *
 * Usage: wordcount <inputpath> <outputpath>
 */
object wordcount {
  // Lazy so that merely loading this object does not start a SparkContext;
  // the context is created the first time main actually needs it.
  lazy val conf: SparkConf = new SparkConf().setAppName("WordCount")
  lazy val sc: SparkContext = new SparkContext(conf)

  /**
   * Entry point.
   *
   * @param args args(0) is the input path, args(1) is the output path
   * @throws IllegalArgumentException if fewer than two arguments are given
   */
  def main(args: Array[String]): Unit = {
    require(args.length >= 2, "usage: wordcount <inputpath> <outputpath>")
    val inputpath = args(0)
    val outputpath = args(1)

    // Resolve the context before the try block so a failed start-up is not
    // retried (and masked) by the stop() call in finally.
    val context = sc
    try {
      val textFile = context.textFile(inputpath)

      // Spark can keep a dataset in the cluster-wide cache, which pays off
      // when the dataset is read repeatedly (e.g. iterative algorithms such
      // as PageRank). Here the filtered lines feed three separate actions.
      val linesWithSpark = textFile.filter(line => line.contains("Spark")).cache() // lines containing "Spark"
      val size = linesWithSpark.map(line => line.split(" ").size) // words per line

      // Largest word count among the selected lines. fold with a zero of 0
      // yields 0 when no line matches, where reduce would throw on the
      // empty RDD.
      val maxsize0 = size.fold(0)((a, b) => math.max(a, b))
      println(maxsize0)

      val select = linesWithSpark.count() // number of lines containing "Spark"
      println(select)

      size.saveAsTextFile(outputpath)
    } finally {
      // Release the context even when the job fails.
      context.stop()
    }
  }
}

本文介绍了一个使用Apache Spark进行词频统计和行单词数分析的项目。通过Scala程序,我们实现了从HDFS读取数据,进行词频统计,并将结果保存回HDFS的功能。此外,还展示了如何统计包含特定关键词的行数,以及这些行各自的单词数和其中的最大单词数。
1341

被折叠的 条评论
为什么被折叠?



