Skip to content

Commit 0df5885

Browse files
author
James Lee
committed
add SameHostsProblem and SameHostsSolution
1 parent e6b73d4 commit 0df5885

File tree

6 files changed

+56
-7
lines changed

6 files changed

+56
-7
lines changed

in/nasa_19950701.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
host logname time method url response bytes referer useragent
1+
host logname time method url response bytes
22
199.72.81.55 - 804571201 GET /history/apollo/ 200 6245
33
unicomp6.unicomp.net - 804571206 GET /shuttle/countdown/ 200 3985
44
199.120.110.21 - 804571209 GET /shuttle/missions/sts-73/mission-sts-73.html 200 4085

in/nasa_19950801.tsv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
host logname time method url response bytes referer useragent
1+
host logname time method url response bytes
22
in24.inetnebr.com - 807249601 GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt 200 1839
33
uplherc.upl.com - 807249607 GET / 304 0
44
uplherc.upl.com - 807249608 GET /images/ksclogo-medium.gif 304 0
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
package com.sparkTutorial.rdd.nasaApacheWebLogs;
2+
3+
public class SameHostsProblem {
4+
5+
public static void main(String[] args) throws Exception {
6+
7+
/* "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
8+
"in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
9+
Create a Spark program to generate a new RDD which contains the hosts that made requests on BOTH days.
10+
Save the resulting RDD to "out/nasa_logs_same_hosts.csv" file.
11+
12+
Example output:
13+
vagrant.vf.mmc.com
14+
www-a1.proxy.aol.com
15+
.....
16+
17+
Keep in mind that the original log files contain the following header line.
18+
host logname time method url response bytes
19+
20+
Make sure the header lines are removed from the resulting RDD.
21+
*/
22+
}
23+
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
package com.sparkTutorial.rdd.nasaApacheWebLogs;
2+
3+
import org.apache.spark.SparkConf;
4+
import org.apache.spark.api.java.JavaRDD;
5+
import org.apache.spark.api.java.JavaSparkContext;
6+
7+
public class SameHostsSolution {
8+
9+
public static void main(String[] args) throws Exception {
10+
11+
SparkConf conf = new SparkConf().setAppName("sameHosts").setMaster("local[1]");
12+
13+
JavaSparkContext sc = new JavaSparkContext(conf);
14+
15+
JavaRDD<String> julyFirstLogs = sc.textFile("in/nasa_19950701.tsv");
16+
JavaRDD<String> augustFirstLogs = sc.textFile("in/nasa_19950801.tsv");
17+
18+
JavaRDD<String> julyFirstHosts = julyFirstLogs.map(line -> line.split("\t")[0]);
19+
20+
JavaRDD<String> augustFirstHosts = augustFirstLogs.map(line -> line.split("\t")[0]);
21+
22+
JavaRDD<String> intersection = julyFirstHosts.intersection(augustFirstHosts);
23+
24+
intersection.saveAsTextFile("out/nasa_logs_same_hosts.csv");
25+
}
26+
}

src/main/java/com/sparkTutorial/rdd/nasaApacheWebLogs/NasaApacheWebLogsProblem.java renamed to src/main/java/com/sparkTutorial/rdd/nasaApacheWebLogs/UnionLogProblem.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,16 @@
11
package com.sparkTutorial.rdd.nasaApacheWebLogs;
22

3-
public class NasaApacheWebLogsProblem {
3+
public class UnionLogProblem {
44

55
public static void main(String[] args) throws Exception {
66

77
/* "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
88
"in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
9-
Create a Spark program to create a new RDD which contains the log lines from both July 1st and August 1st,
9+
Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
1010
take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.
1111
1212
Keep in mind that the original log files contain the following header line.
13-
host logname time method url response bytes referer useragent
13+
host logname time method url response bytes
1414
1515
Make sure the header lines are removed from the resulting RDD.
1616
*/

src/main/java/com/sparkTutorial/rdd/nasaApacheWebLogs/NasaApacheWebLogsSolution.java renamed to src/main/java/com/sparkTutorial/rdd/nasaApacheWebLogs/UnionLogsSolution.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
import org.apache.spark.api.java.JavaRDD;
55
import org.apache.spark.api.java.JavaSparkContext;
66

7-
public class NasaApacheWebLogsSolution {
7+
public class UnionLogsSolution {
88

99
public static void main(String[] args) throws Exception {
1010

11-
SparkConf conf = new SparkConf().setAppName("nasaApacheWebLogs").setMaster("local[*]");
11+
SparkConf conf = new SparkConf().setAppName("unionLogs").setMaster("local[*]");
1212

1313
JavaSparkContext sc = new JavaSparkContext(conf);
1414

0 commit comments

Comments
 (0)