File tree Expand file tree Collapse file tree 6 files changed +56
-7
lines changed
src/main/java/com/sparkTutorial/rdd/nasaApacheWebLogs Expand file tree Collapse file tree 6 files changed +56
-7
lines changed Original file line number Diff line number Diff line change 1
- host logname time method url response bytes referer useragent
1
+ host logname time method url response bytes
2
2
199.72.81.55 - 804571201 GET /history/apollo/ 200 6245
3
3
unicomp6.unicomp.net - 804571206 GET /shuttle/countdown/ 200 3985
4
4
199.120.110.21 - 804571209 GET /shuttle/missions/sts-73/mission-sts-73.html 200 4085
Original file line number Diff line number Diff line change 1
- host logname time method url response bytes referer useragent
1
+ host logname time method url response bytes
2
2
in24.inetnebr.com - 807249601 GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt 200 1839
3
3
uplherc.upl.com - 807249607 GET / 304 0
4
4
uplherc.upl.com - 807249608 GET /images/ksclogo-medium.gif 304 0
Original file line number Diff line number Diff line change
1
+ package com .sparkTutorial .rdd .nasaApacheWebLogs ;
2
+
3
+ public class SameHostsProblem {
4
+
5
+ public static void main (String [] args ) throws Exception {
6
+
7
+ /* The "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
8
+ The "in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995.
9
+ Create a Spark program to generate a new RDD which contains the hosts that appear in the logs of BOTH days.
10
+ Save the resulting RDD to the "out/nasa_logs_same_hosts.csv" file.
11
+
12
+ Example output:
13
+ vagrant.vf.mmc.com
14
+ www-a1.proxy.aol.com
15
+ .....
16
+
17
+ Keep in mind that each of the original log files contains the following header line:
18
+ host logname time method url response bytes
19
+
20
+ Make sure the header lines are removed from the resulting RDD.
21
+ */
22
+ }
23
+ }
Original file line number Diff line number Diff line change
1
+ package com .sparkTutorial .rdd .nasaApacheWebLogs ;
2
+
3
+ import org .apache .spark .SparkConf ;
4
+ import org .apache .spark .api .java .JavaRDD ;
5
+ import org .apache .spark .api .java .JavaSparkContext ;
6
+
7
+ public class SameHostsSolution {
8
+
9
+ public static void main (String [] args ) throws Exception {
10
+
11
+ SparkConf conf = new SparkConf ().setAppName ("sameHosts" ).setMaster ("local[1]" );
12
+
13
+ JavaSparkContext sc = new JavaSparkContext (conf );
14
+
15
+ JavaRDD <String > julyFirstLogs = sc .textFile ("in/nasa_19950701.tsv" );
16
+ JavaRDD <String > augustFirstLogs = sc .textFile ("in/nasa_19950801.tsv" );
17
+
18
+ JavaRDD <String > julyFirstHosts = julyFirstLogs .map (line -> line .split ("\t " )[0 ]);
19
+
20
+ JavaRDD <String > augustFirstHosts = augustFirstLogs .map (line -> line .split ("\t " )[0 ]);
21
+
22
+ JavaRDD <String > intersection = julyFirstHosts .intersection (augustFirstHosts );
23
+
24
+ intersection .saveAsTextFile ("out/nasa_logs_same_hosts.csv" );
25
+ }
26
+ }
Original file line number Diff line number Diff line change 1
1
package com .sparkTutorial .rdd .nasaApacheWebLogs ;
2
2
3
- public class NasaApacheWebLogsProblem {
3
+ public class UnionLogProblem {
4
4
5
5
public static void main (String [] args ) throws Exception {
6
6
7
7
/* "in/nasa_19950701.tsv" file contains 10000 log lines from one of NASA's Apache servers for July 1st, 1995.
8
8
"in/nasa_19950801.tsv" file contains 10000 log lines for August 1st, 1995
9
- Create a Spark program to create a new RDD which contains the log lines from both July 1st and August 1st,
9
+ Create a Spark program to generate a new RDD which contains the log lines from both July 1st and August 1st,
10
10
take a 0.1 sample of those log lines and save it to "out/sample_nasa_logs.tsv" file.
11
11
12
12
Keep in mind that each of the original log files contains the following header line:
13
- host logname time method url response bytes referer useragent
13
+ host logname time method url response bytes
14
14
15
15
Make sure the header lines are removed from the resulting RDD.
16
16
*/
Original file line number Diff line number Diff line change 4
4
import org .apache .spark .api .java .JavaRDD ;
5
5
import org .apache .spark .api .java .JavaSparkContext ;
6
6
7
- public class NasaApacheWebLogsSolution {
7
+ public class UnionLogsSolution {
8
8
9
9
public static void main (String [] args ) throws Exception {
10
10
11
- SparkConf conf = new SparkConf ().setAppName ("nasaApacheWebLogs " ).setMaster ("local[*]" );
11
+ SparkConf conf = new SparkConf ().setAppName ("unionLogs " ).setMaster ("local[*]" );
12
12
13
13
JavaSparkContext sc = new JavaSparkContext (conf );
14
14
You can’t perform that action at this time.
0 commit comments