Java tutorial
/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading; import java.io.File; import java.io.IOException; import java.text.ParseException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.flow.MultiMapReducePlanner; import cascading.operation.Identity; import cascading.operation.Insert; import cascading.operation.regex.RegexParser; import cascading.operation.regex.RegexSplitter; import cascading.operation.text.DateParser; import cascading.pipe.CoGroup; import cascading.pipe.Each; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.Lfs; import cascading.tap.Tap; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntryIterator; import cascading.util.Util; import org.apache.hadoop.mapred.JobConf; public class SortedValuesTest extends ClusterTestCase { String inputFileApache = "build/test/data/apache.200.txt"; String inputFileIps = "build/test/data/ips.20.txt"; String inputFileCross = "build/test/data/lhs+rhs-cross.txt"; String outputPath = "build/test/output/sorting/"; private String apacheCommonRegex = TestConstants.APACHE_COMMON_REGEX; private RegexParser apacheCommonParser = new RegexParser( new Fields("ip", "time", "method", "event", "status", "size"), apacheCommonRegex, new int[] { 1, 2, 3, 4, 5, 6 }); public SortedValuesTest() { super("sorted values", false); // disable cluster } public void testCoGroupComparatorValues() throws Exception { runCoGroupComparatorTest("cogroupcompareforward", false); } public void testCoGroupComparatorValuesReversed() throws Exception { runCoGroupComparatorTest("cogroupcomparereversed", true); } private void runCoGroupComparatorTest(String path, boolean reverseSort) throws IOException, ParseException { if (!new File(inputFileApache).exists()) fail("data file not found"); copyFromLocal(inputFileApache); copyFromLocal(inputFileIps); Tap sourceApache = new Hfs(new TextLine(), inputFileApache); Tap sourceIP = new Hfs(new TextLine(), inputFileIps); Tap sink = new Hfs(new TextLine(), outputPath + path, true); Pipe apachePipe = new Pipe("apache"); apachePipe = new Each(apachePipe, new Fields("line"), apacheCommonParser); apachePipe = new Each(apachePipe, new Insert(new Fields("col"), 1), Fields.ALL); apachePipe = new Each(apachePipe, new Fields("ip"), new RegexParser(new Fields("octet"), "^[^.]*"), new Fields("col", "status", "event", "octet", "size")); apachePipe = new Each(apachePipe, new Fields("octet"), new Identity(long.class), Fields.REPLACE); Fields groupApache = new Fields("octet"); groupApache.setComparator("octet", new TestLongComparator(reverseSort)); Pipe ipPipe = new Pipe("ip"); ipPipe = new Each(ipPipe, new Fields("line"), new Identity(new Fields("rawip"))); ipPipe = new Each(ipPipe, new Fields("rawip"), new RegexParser(new Fields("rawoctet"), "^[^.]*"), new Fields("rawoctet")); ipPipe = new Each(ipPipe, new Fields("rawoctet"), new Identity(long.class), Fields.REPLACE); Fields groupIP = new Fields("rawoctet"); groupIP.setComparator("rawoctet", new TestLongComparator(reverseSort)); Pipe pipe = new CoGroup(apachePipe, groupApache, ipPipe, groupIP); pipe = new Each(pipe, new Identity()); // let's force the stack to be exercised Map<Object, Object> properties = getProperties(); if (MultiMapReducePlanner.getJobConf(properties) != null) MultiMapReducePlanner.getJobConf(properties).setNumMapTasks(13); Map sources = new HashMap(); sources.put("apache", sourceApache); sources.put("ip", sourceIP); Flow flow = new FlowConnector(properties).connect(sources, sink, pipe); flow.complete(); validateFile(sink, 199, 16, reverseSort, 5); } public void testComprehensiveGroupBy() throws IOException { Boolean[][] testArray = new Boolean[][] { // test group comparators { false, null, false }, { true, null, false }, // test group, reversed { false, null, true }, { true, null, true }, // test group and sort comparators { false, false, false }, { true, false, false }, { true, true, false }, { false, true, false }, // test group and sort comparators, reversed { false, false, true }, { true, false, true }, { true, true, true }, { false, true, true } }; for (int i = 0; i < testArray.length; i++) runComprehensiveCase(testArray[i], false); for (int i = 0; i < testArray.length; i++) runComprehensiveCase(testArray[i], true); } private void runComprehensiveCase(Boolean[] testCase, boolean useCollectionsComparator) throws IOException { if (!new File(inputFileCross).exists()) fail("data file not found"); copyFromLocal(inputFileCross); String test = Util.join(testCase, "_", true); String path = "comprehensive/" + test; Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross); Tap sink = new Hfs(new TextLine(new Fields("line"), new Fields("num", "lower", "upper"), 1), outputPath + path, true); Pipe pipe = new Pipe("comprehensivesort"); pipe = new Each(pipe, new Fields("line"), new RegexSplitter(new Fields("num", "lower", "upper"), "\\s")); pipe = new Each(pipe, new Fields("num"), new Identity(long.class), Fields.REPLACE); Fields groupFields = new Fields("num"); if (testCase[0]) groupFields.setComparator("num", useCollectionsComparator ? Collections.reverseOrder() : new TestLongComparator()); Fields sortFields = null; if (testCase[1] != null) { sortFields = new Fields("upper"); if (testCase[1]) sortFields.setComparator("upper", useCollectionsComparator ? Collections.reverseOrder() : new TestStringComparator()); } pipe = new GroupBy(pipe, groupFields, sortFields, testCase[2]); Map<Object, Object> properties = getProperties(); if (MultiMapReducePlanner.getJobConf(properties) != null) MultiMapReducePlanner.getJobConf(properties).setNumMapTasks(13); Flow flow = new FlowConnector(properties).connect(source, sink, pipe); flow.complete(); validateCase(test, testCase, sink); } private void validateCase(String test, Boolean[] testCase, Tap sink) throws IOException { TupleEntryIterator iterator = sink.openForRead(new JobConf()); LinkedHashMap<Long, List<String>> group = new LinkedHashMap<Long, List<String>>(); while (iterator.hasNext()) { Tuple tuple = iterator.next().getTuple(); String[] values = tuple.getString(0).split("\\s"); long num = Long.parseLong(values[0]); if (!group.containsKey(num)) group.put(num, new ArrayList<String>()); group.get(num).add(values[2]); } boolean groupIsReversed = testCase[0]; if (testCase[2]) groupIsReversed = !groupIsReversed; compare("grouping+" + test, groupIsReversed, group.keySet()); if (testCase[1] == null) return; boolean valueIsReversed = testCase[1]; if (testCase[2]) valueIsReversed = !valueIsReversed; for (Long grouping : group.keySet()) compare("values+" + test, valueIsReversed, group.get(grouping)); } private void compare(String test, boolean isReversed, Collection values) { List<Object> groups = new ArrayList<Object>(values); List<Object> sortedGroups = new ArrayList<Object>(groups); Collections.sort(sortedGroups, isReversed ? Collections.reverseOrder() : null); assertEquals(test, sortedGroups, groups); } public void testSortFails() throws Exception { String path = "fails"; if (!new File(inputFileApache).exists()) fail("data file not found"); copyFromLocal(inputFileApache); Tap source = new Lfs(new TextLine(), inputFileApache); Tap sink = new Lfs(new TextLine(), outputPath + path, true); Pipe pipe = new Pipe("apache"); // RegexParser.APACHE declares: "time", "method", "event", "status", "size" pipe = new Each(pipe, new Fields("line"), apacheCommonParser); pipe = new Each(pipe, new Insert(new Fields("col"), 1), Fields.ALL); // DateParser.APACHE declares: "ts" pipe = new Each(pipe, new Fields("time"), new DateParser("dd/MMM/yyyy:HH:mm:ss Z"), new Fields("col", "status", "ts", "event", "ip", "size")); pipe = new GroupBy(pipe, new Fields("col"), new Fields("does-not-exist")); pipe = new Each(pipe, new Identity()); // let's force the stack to be exercised Map<Object, Object> properties = getProperties(); MultiMapReducePlanner.getJobConf(properties).setNumMapTasks(13); try { new FlowConnector(properties).connect(source, sink, pipe); fail("did not throw exception"); } catch (Exception exception) { // passes } } private void validateFile(Tap tap, int length, int uniqueValues, boolean isReversed, int comparePosition) throws IOException, ParseException { TupleEntryIterator iterator = tap.openForRead(new JobConf()); Set<Long> values = new HashSet<Long>(); long lastValue = isReversed ? Long.MAX_VALUE : Long.MIN_VALUE; int count = 0; while (iterator.hasNext()) { Tuple tuple = iterator.next().getTuple(); count++; tuple = new Tuple((Object[]) tuple.getString(1).split("\t")); long value = tuple.getLong(comparePosition); values.add(value); if (isReversed) assertTrue("out of order in " + tap, lastValue >= value); else assertTrue("out of order in " + tap, lastValue <= value); lastValue = value; } if (length != -1) assertEquals("length of " + tap, length, count); if (uniqueValues != -1) assertEquals("unique values of " + tap, uniqueValues, values.size()); } }