
Commit f637b18
Author: Pedro Bernardo (committed)

Setting log level to ERROR in scripts that print to the standard output

1 parent 131e3cf · commit f637b18
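
Every file in this commit applies the same one-line pattern: immediately after the SparkContext is created, the log level is raised to ERROR so Spark's INFO/WARN console logging does not drown out the values the script prints. A minimal sketch of the pattern, assuming a local PySpark installation (the app name and sample data below are illustrative, not taken from the repository):

from pyspark import SparkContext

if __name__ == "__main__":
    # Local SparkContext; the second argument is just the application name
    sc = SparkContext("local", "logLevelDemo")
    # From here on only ERROR (and fatal) messages reach the console,
    # so the script's own print output stays readable
    sc.setLogLevel("ERROR")
    numbers = sc.parallelize([2, 3, 5, 7])
    print("count:", numbers.count())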

File tree

8 files changed: +23 −18 lines changed


rdd/WordCount.py

Lines changed: 7 additions & 6 deletions

@@ -2,9 +2,10 @@
 from pyspark import SparkContext
 
 if __name__ == "__main__":
-    sc = SparkContext("local", "word count")
-    lines = sc.textFile("in/word_count.text")
-    words = lines.flatMap(lambda line: line.split(" "))
-    wordCounts = words.countByValue()
-    for word, count in wordCounts.items():
-        print(word, count)
+    sc = SparkContext("local", "word count")
+    sc.setLogLevel("ERROR")
+    lines = sc.textFile("in/word_count.text")
+    words = lines.flatMap(lambda line: line.split(" "))
+    wordCounts = words.countByValue()
+    for word, count in wordCounts.items():
+        print(word, count)

rdd/collect/CollectExample.py

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "collect")
+    sc.setLogLevel("ERROR")
     inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
     wordRdd = sc.parallelize(inputWords)
     words = wordRdd.collect()

rdd/count/CountExample.py

Lines changed: 2 additions & 1 deletion

@@ -2,10 +2,11 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "count")
+    sc.setLogLevel("ERROR")
     inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
     wordRdd = sc.parallelize(inputWords)
     print("Count: {}".format(wordRdd.count()))
     worldCountByValue = wordRdd.countByValue()
     print("CountByValue: ")
     for word, count in worldCountByValue.items():
-        print("{} : {}".format(word, count))
+        print("{} : {}".format(word, count))

rdd/nasaApacheWebLogs/UnionLogSolutions.py

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 from pyspark import SparkContext
 
-def isNotHeader(line:str):
+def isNotHeader(line: str):
     return not (line.startswith("host") and "bytes" in line)
 
 if __name__ == "__main__":

rdd/reduce/ReduceExample.py

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "reduce")
+    sc.setLogLevel("ERROR")
     inputIntegers = [1, 2, 3, 4, 5]
     integerRdd = sc.parallelize(inputIntegers)
     product = integerRdd.reduce(lambda x, y: x * y)
rdd/sumOfNumbers/SumOfNumbersProblem.py

Lines changed: 2 additions & 3 deletions

@@ -1,11 +1,10 @@
-
 import sys
 from pyspark import SparkContext
 
 if __name__ == "__main__":
 
-    '''
+    '''
     Create a Spark program to read the first 100 prime numbers from in/prime_nums.text,
     print the sum of those numbers to console.
     Each row of the input file contains 10 prime numbers separated by spaces.
-    '''
+    '''

rdd/sumOfNumbers/SumOfNumbersSolution.py

Lines changed: 2 additions & 1 deletion

@@ -3,9 +3,10 @@
 
 if __name__ == "__main__":
     sc = SparkContext("local", "primeNumbers")
+    sc.setLogLevel("ERROR")
     lines = sc.textFile("in/prime_nums.text")
     numbers = lines.flatMap(lambda line: line.split("\t"))
     validNumbers = numbers.filter(lambda number: number)
     intNumbers = validNumbers.map(lambda number: int(number))
     print("Sum is: ")
-    print(intNumbers.reduce(lambda x, y: x + y))
+    print(intNumbers.reduce(lambda x, y: x + y))

rdd/take/TakeExample.py

Lines changed: 7 additions & 6 deletions

@@ -2,9 +2,10 @@
 from pyspark import SparkContext
 
 if __name__ == "__main__":
-    sc = SparkContext("local", "take")
-    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
-    wordRdd = sc.parallelize(inputWords)
-    words = wordRdd.take(3)
-    for word in words:
-        print(word)
+    sc = SparkContext("local", "take")
+    sc.setLogLevel("ERROR")
+    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+    wordRdd = sc.parallelize(inputWords)
+    words = wordRdd.take(3)
+    for word in words:
+        print(word)
