Java tutorial
/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.tap; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; import java.util.regex.Pattern; import cascading.ClusterTestCase; import cascading.cascade.Cascade; import cascading.cascade.CascadeConnector; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.flow.MultiMapReducePlanner; import cascading.operation.Function; import cascading.operation.Identity; import cascading.operation.regex.RegexSplitter; import cascading.pipe.Each; import cascading.pipe.Pipe; import cascading.scheme.SequenceFile; import cascading.scheme.TextDelimited; import cascading.scheme.TextLine; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntryIterator; import org.apache.hadoop.mapred.JobConf; /** * */ public class TapTest extends ClusterTestCase implements Serializable { String inputFileComments = "build/test/data/comments+lower.txt"; String inputFileJoined = "build/test/data/lower+upper.txt"; String inputFileCross = "build/test/data/lhs+rhs-cross.txt"; String inputFileUpper = "build/test/data/upper.txt"; String inputFileLower = "build/test/data/lower.txt"; String outputPath = "build/test/output/tap/"; public TapTest() { super("tap tests", true); } public void testDfs() throws URISyntaxException, IOException { Tap tap = new Dfs(new Fields("foo"), "some/path"); assertTrue("wrong scheme", tap.getQualifiedPath(MultiMapReducePlanner.getJobConf(getProperties())).toUri() .getScheme().equalsIgnoreCase("hdfs")); new Dfs(new Fields("foo"), "hdfs://localhost:5001/some/path"); new Dfs(new Fields("foo"), new URI("hdfs://localhost:5001/some/path")); try { new Dfs(new Fields("foo"), "s3://localhost:5001/some/path"); fail("not valid url"); } catch (Exception exception) { } try { new Dfs(new Fields("foo"), new URI("s3://localhost:5001/some/path")); fail("not valid url"); } catch (Exception exception) { } } public void testS3fs() throws URISyntaxException, IOException { // don't test qualified path, it tries to connect to s3 service new S3fs(new Fields("foo"), "s3://localhost:5001/some/path"); new S3fs(new Fields("foo"), new URI("s3://localhost:5001/some/path")); try { new S3fs(new Fields("foo"), "hdfs://localhost:5001/some/path"); fail("not valid url"); } catch (Exception exception) { } try { new S3fs(new Fields("foo"), new URI("hdfs://localhost:5001/some/path")); fail("not valid url"); } catch (Exception exception) { } } public void testLfs() throws URISyntaxException, IOException { Tap tap = new Lfs(new Fields("foo"), "some/path"); assertTrue("wrong scheme", tap.getQualifiedPath(MultiMapReducePlanner.getJobConf(getProperties())).toUri() .getScheme().equalsIgnoreCase("file")); new Lfs(new Fields("foo"), "file:///some/path"); try { new Lfs(new Fields("foo"), "s3://localhost:5001/some/path"); fail("not valid url"); } catch (Exception exception) { } } public class CommentScheme extends TextLine { public CommentScheme() { } public CommentScheme(Fields sourceFields) { super(sourceFields); } @Override public Tuple source(Object key, Object value) { if (value.toString().matches("^\\s*#.*$")) return null; return super.source(key, value); } } public void testNullsFromScheme() throws IOException { if (!new File(inputFileComments).exists()) fail("data file not found"); copyFromLocal(inputFileComments); Tap source = new Hfs(new CommentScheme(new Fields("line")), inputFileComments); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new Identity()); Tap sink = new Hfs(new TextLine(1), outputPath + "/testnulls", true); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); validateLength(flow, 5, null); TupleEntryIterator iterator = flow.openSink(); assertEquals("not equal: tuple.get(1)", "1 a", iterator.next().get(1)); iterator.close(); // confirm the tuple iterator can handle nulls from the source validateLength(flow.openSource(), 5); } public void testTemplateTap() throws IOException { if (!new File(inputFileJoined).exists()) fail("data file not found"); copyFromLocal(inputFileJoined); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t")); Tap sink = new Hfs(new TextLine(1), outputPath + "/testtemplates", true); sink = new TemplateTap((Hfs) sink, "%s-%s", 1); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); Tap test = new Hfs(new TextLine(1), sink.getPath().toString() + "/1-a"); validateLength(flow.openTapForRead(test), 1); test = new Hfs(new TextLine(1), sink.getPath().toString() + "/2-b"); validateLength(flow.openTapForRead(test), 1); } public void testTemplateTapTextDelimited() throws IOException { if (!new File(inputFileJoined).exists()) fail("data file not found"); copyFromLocal(inputFileJoined); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t")); Tap sink = new Hfs(new TextDelimited(new Fields("number", "lower", "upper"), "+"), outputPath + "/testdelimitedtemplates", true); sink = new TemplateTap((Hfs) sink, "%s-%s", 1); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); Tap test = new Hfs(new TextLine(new Fields("line")), sink.getPath().toString() + "/1-a"); validateLength(flow.openTapForRead(test), 1, Pattern.compile("[0-9]\\+[a-z]\\+[A-Z]")); test = new Hfs(new TextLine(new Fields("line")), sink.getPath().toString() + "/2-b"); validateLength(flow.openTapForRead(test), 1, Pattern.compile("[0-9]\\+[a-z]\\+[A-Z]")); } public void testTemplateTapView() throws IOException { if (!new File(inputFileJoined).exists()) fail("data file not found"); copyFromLocal(inputFileJoined); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t")); Tap sink = new Hfs(new SequenceFile(new Fields("upper")), outputPath + "/testtemplatesview", true); sink = new TemplateTap((Hfs) sink, "%s-%s", new Fields("number", "lower"), 1); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); Tap test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/1-a"); validateLength(flow.openTapForRead(test), 1, 1); test = new Hfs(new SequenceFile(new Fields("upper")), sink.getPath().toString() + "/2-b"); validateLength(flow.openTapForRead(test), 1, 1); TupleEntryIterator input = flow.openTapForRead(test); // open 2-b assertEquals("wrong value", "B", input.next().get(0)); input.close(); } public void testSinkDeclaredFields() throws IOException { if (!new File(inputFileCross).exists()) fail("data file not found"); copyFromLocal(inputFileCross); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("first", "second", "third"), "\\s"), Fields.ALL); Tap sink = new Hfs(new TextLine(new Fields("line"), new Fields("second", "first", "third")), outputPath + "/declaredsinks", true); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); // flow.writeDOT( "declaredsinks.dot" ); flow.complete(); validateLength(flow, 37, null); TupleEntryIterator iterator = flow.openSink(); String line = iterator.next().getString(0); assertTrue("not equal: wrong values", line.matches("[a-z]\t[0-9]\t[A-Z]")); iterator.close(); } public void testSinkUnknown() throws IOException { if (!new File(inputFileCross).exists()) fail("data file not found"); copyFromLocal(inputFileCross); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileCross); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("first", "second", "third"), "\\s"), Fields.RESULTS); Tap sink = new Hfs(new SequenceFile(Fields.UNKNOWN), outputPath + "/unknownsinks", true); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); validateLength(flow, 37, null); TupleEntryIterator iterator = flow.openSink(); String line = iterator.next().getTuple().toString(); assertTrue("not equal: wrong values: " + line, line.matches("[0-9]\t[a-z]\t[A-Z]")); iterator.close(); } public void testMultiSinkTap() throws IOException { if (!new File(inputFileJoined).exists()) fail("data file not found"); copyFromLocal(inputFileJoined); Tap source = new Hfs(new TextLine(new Fields("line")), inputFileJoined); Pipe pipe = new Pipe("test"); pipe = new Each(pipe, new RegexSplitter(new Fields("number", "lower", "upper"), "\t")); Tap lhsSink = new Hfs(new TextLine(new Fields("offset", "line"), new Fields("number", "lower")), outputPath + "/multisink/lhs", SinkMode.REPLACE); Tap rhsSink = new Hfs(new TextLine(new Fields("offset", "line"), new Fields("number", "upper")), outputPath + "/multisink/rhs", SinkMode.REPLACE); Tap sink = new MultiSinkTap(lhsSink, rhsSink); Flow flow = new FlowConnector(getProperties()).connect(source, sink, pipe); flow.complete(); validateLength(flow.openTapForRead(lhsSink), 5); validateLength(flow.openTapForRead(rhsSink), 5); } public void testGlobHfs() throws Exception { if (!new File(inputFileLower).exists()) fail("data file not found"); copyFromLocal(inputFileLower); copyFromLocal(inputFileUpper); GlobHfs source = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{ppe[_r],owe?}.txt"); assertEquals(2, source.getTaps().length); // show globhfs will just match a directory if ended with a / assertEquals(1, new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/?ata/").getTaps().length); // using null pos so all fields are written Tap sink = new Hfs(new TextLine(), outputPath + "/glob/", true); Function splitter = new RegexSplitter(new Fields("num", "char"), "\\s"); Pipe concatPipe = new Each(new Pipe("concat"), new Fields("line"), splitter); Flow concatFlow = new FlowConnector(getProperties()).connect("first", source, sink, concatPipe); Tap nextSink = new Hfs(new TextLine(), outputPath + "/glob2/", true); Flow nextFlow = new FlowConnector(getProperties()).connect("second", sink, nextSink, concatPipe); Cascade cascade = new CascadeConnector().connect(concatFlow, nextFlow); cascade.complete(); // countFlow.writeDOT( "cogroup.dot" ); // System.out.println( "countFlow =\n" + countFlow ); validateLength(concatFlow, 10, null); } public void testNestedMultiSource() throws Exception { if (!new File(inputFileLower).exists()) fail("data file not found"); copyFromLocal(inputFileLower); copyFromLocal(inputFileUpper); GlobHfs source1 = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{ppe[_r]}.txt"); GlobHfs source2 = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{owe?}.txt"); MultiSourceTap source = new MultiSourceTap(source1, source2); assertEquals(2, source.getTaps().length); // using null pos so all fields are written Tap sink = new Hfs(new TextLine(), outputPath + "/glob/", true); Function splitter = new RegexSplitter(new Fields("num", "char"), "\\s"); Pipe concatPipe = new Each(new Pipe("concat"), new Fields("line"), splitter); Flow concatFlow = new FlowConnector(getProperties()).connect("first", source, sink, concatPipe); Tap nextSink = new Hfs(new TextLine(), outputPath + "/glob2/", true); Flow nextFlow = new FlowConnector(getProperties()).connect("second", sink, nextSink, concatPipe); Cascade cascade = new CascadeConnector().connect(concatFlow, nextFlow); cascade.complete(); validateLength(concatFlow, 10, null); } public void testMultiSourceIterator() throws Exception { if (!new File(inputFileLower).exists()) fail("data file not found"); copyFromLocal(inputFileLower); copyFromLocal(inputFileUpper); Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower); Tap sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), inputFileUpper); Tap source = new MultiSourceTap(sourceLower, sourceUpper); validateLength(source.openForRead(new JobConf()), 10, null); GlobHfs source1 = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{ppe[_r]}.txt"); GlobHfs source2 = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{owe?}.txt"); source = new MultiSourceTap(source1, source2); validateLength(source.openForRead(new JobConf()), 10, null); GlobHfs sourceMulti = new GlobHfs(new TextLine(new Fields("offset", "line")), "build/test/data/?{ppe[_r],owe?}.txt"); source = new MultiSourceTap(sourceMulti); validateLength(source.openForRead(new JobConf()), 10, null); } }