List of usage examples for org.apache.hadoop.fs.FileSystem#open
public FSDataInputStream open(PathHandle fd) throws IOException
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
/**
 * Runs the dedup job with {@code -dedupBy value} over two three-line text inputs read through
 * {@code DelimitedTextInputFormat}. The two inputs share exactly one identical line, so the test
 * expects 6 records read, 0 bad records, 5 output records and 1 duplicate, and then checks that
 * the files under {@code <home>/output} contain exactly the five distinct lines.
 *
 * @throws Exception if creating the inputs, running the job or reading the output fails
 */
@Test
public void testDedupByValueWithDelimitedTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");

    String[] args = new String[] {
            "-inputFormat", "co.nubetech.hiho.dedup.DelimitedTextInputFormat",
            "-inputKeyClassName", "org.apache.hadoop.io.Text",
            "-inputValueClassName", "org.apache.hadoop.io.Text",
            "-inputPath", "/input1,/input2",
            "-outputPath", "output",
            "-delimeter", ",",
            "-column", "1",
            "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    // Check existence before listing so a missing directory fails this assertion
    // instead of surfacing as an exception from listStatus.
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());

    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");

    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        try {
            // readLine() is the deprecated byte-oriented reader; the expected lines are plain ASCII.
            String line = null;
            while ((line = in.readLine()) != null) {
                logger.debug("Output is " + line);
                assertTrue("Matched output " + line, expectedOutput.contains(line));
                // Remove each match so a line emitted twice fails the contains check above.
                expectedOutput.remove(line);
                count++;
            }
        } finally {
            // Close even when an assertion fails so the stream is not leaked.
            in.close();
        }
    }
    assertEquals(5, count);
}
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
/**
 * Runs the dedup job with {@code -dedupBy value} over two three-line text inputs read through
 * Hadoop's {@code TextInputFormat} and written with {@code NoKeyOnlyValueOutputFormat}. The two
 * inputs share exactly one identical line, so the test expects 6 records read, 0 bad records,
 * 5 output records and 1 duplicate, and then checks that the files under {@code <home>/output}
 * contain exactly the five distinct lines.
 *
 * @throws Exception if creating the inputs, running the job or reading the output fails
 */
@Test
public void testDedupByValueWithTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");

    String[] args = new String[] {
            "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
            "-inputPath", "/input1,/input2",
            "-outputPath", "output",
            "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat",
            "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    // Check existence before listing so a missing directory fails this assertion
    // instead of surfacing as an exception from listStatus.
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());

    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");

    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        try {
            // readLine() is the deprecated byte-oriented reader; the expected lines are plain ASCII.
            String line = null;
            while ((line = in.readLine()) != null) {
                logger.debug("Output is " + line);
                assertTrue("Matched output " + line, expectedOutput.contains(line));
                // Remove each match so a line emitted twice fails the contains check above.
                expectedOutput.remove(line);
                count++;
            }
        } finally {
            // Close even when an assertion fails so the stream is not leaked.
            in.close();
        }
    }
    assertEquals(5, count);
}
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
/**
 * Runs the dedup job with {@code -dedupBy value} over two sequence files of {@code Text} keys and
 * values, read through {@code SequenceFileAsTextInputFormat} and written with
 * {@code NoKeyOnlyValueOutputFormat}. One value appears in both files (under keys "3" and "4"),
 * so the test expects 6 records read, 0 bad records, 5 output records and 1 duplicate, and then
 * checks that the files under {@code <home>/output} contain exactly the five distinct values.
 *
 * @throws Exception if creating the inputs, running the job or reading the output fails
 */
@Test
public void testDedupByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<Text, Text> inputData1 = new HashMap<Text, Text>();
    inputData1.put(new Text("1"),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new Text("2"),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new Text("3"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Text, Text> inputData2 = new HashMap<Text, Text>();
    inputData2.put(new Text("1"),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new Text("2"),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new Text("4"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] {
            "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat",
            "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat",
            "-inputPath", "/input1,/input2",
            "-outputPath", "output",
            "-inputKeyClassName", "org.apache.hadoop.io.Text",
            "-inputValueClassName", "org.apache.hadoop.io.Text",
            "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    // Check existence before listing so a missing directory fails this assertion
    // instead of surfacing as an exception from listStatus.
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());

    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");

    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        try {
            // readLine() is the deprecated byte-oriented reader; the expected lines are plain ASCII.
            String line = null;
            while ((line = in.readLine()) != null) {
                logger.debug("Output is " + line);
                assertTrue("Matched output " + line, expectedOutput.contains(line));
                // Remove each match so a line emitted twice fails the contains check above.
                expectedOutput.remove(line);
                count++;
            }
        } finally {
            // Close even when an assertion fails so the stream is not leaked.
            in.close();
        }
    }
    assertEquals(5, count);
}
From source file:co.nubetech.hiho.job.TestDBQueryInputJobWithCluster.java
License:Apache License
@Test public void testBasicTableImport() throws Exception { DBQueryInputJob job = new DBQueryInputJob(); String[] args = new String[] { "-jdbcDriver", "org.hsqldb.jdbcDriver", "-jdbcUrl", "jdbc:hsqldb:hsql://localhost/URLAccess", // "-jdbcUsername", "", // "-jdbcPassword", "", "-outputPath", "testBasicTableImport", "-outputStrategy", "delimited", "-delimiter", "DELIM", "-numberOfMappers", "2", "-inputTableName", "Pageview", "-inputOrderBy", "pageview" }; int res = ToolRunner.run(createJobConf(), job, args); assertEquals(0, res);//from w w w. ja v a2s . com //lets verify the result now FileSystem outputFS = getFileSystem(); //Path outputPath = getOutputDir(); Path outputPath = new Path(outputFS.getHomeDirectory(), "testBasicTableImport"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("/aDELIM1000"); expectedOutput.add("/bDELIM2000"); expectedOutput.add("/cDELIM3000"); expectedOutput.add("/dDELIM4000"); expectedOutput.add("/eDELIM5000"); expectedOutput.add("/fDELIM6000"); expectedOutput.add("/gDELIM7000"); expectedOutput.add("/hDELIM8000"); expectedOutput.add("/iDELIM9000"); expectedOutput.add("/jDELIM10000"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(10, count); }
From source file:co.nubetech.hiho.job.TestDBQueryInputJobWithCluster.java
License:Apache License
@Test public void testQueryBasedImport() throws Exception { DBQueryInputJob job = new DBQueryInputJob(); String[] args = new String[] { "-jdbcDriver", "org.hsqldb.jdbcDriver", "-jdbcUrl", "jdbc:hsqldb:hsql://localhost/URLAccess", "-outputPath", "testQueryBasedImport", "-inputQuery", "select url,pageview,commentCount from Pageview, PageComment where Pageview.url = PageComment.url", "-inputBoundingQuery", "select min(commentCount), max(commentCount) from PageComment", "-outputStrategy", "delimited", "-delimiter", "DELIM", "-numberOfMappers", "2", "-inputOrderBy", "Pageview.pageview" }; int res = ToolRunner.run(createJobConf(), job, args); assertEquals(0, res);//from w w w .j ava 2s.c o m //lets verify the result now FileSystem outputFS = getFileSystem(); Path outputPath = new Path(outputFS.getHomeDirectory(), "testQueryBasedImport"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("/aDELIM1000DELIM10"); expectedOutput.add("/bDELIM2000DELIM10"); expectedOutput.add("/cDELIM3000DELIM10"); expectedOutput.add("/dDELIM4000DELIM10"); expectedOutput.add("/eDELIM5000DELIM10"); expectedOutput.add("/fDELIM6000DELIM10"); expectedOutput.add("/gDELIM7000DELIM10"); expectedOutput.add("/hDELIM8000DELIM10"); expectedOutput.add("/iDELIM9000DELIM10"); expectedOutput.add("/jDELIM10000DELIM10"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(10, count); }
From source file:co.nubetech.hiho.mapred.input.FileStreamRecordReader.java
License:Apache License
/**
 * Opens the file of the current split and returns the stream as the record value.
 * Also records the file's name in {@code fileName}.
 *
 * @return the opened stream, or {@code null} if the file could not be opened (the failure is
 *         logged, because this method's signature does not declare {@code IOException})
 */
@Override
public FSDataInputStream createValue() {
    logger.debug("Creating value");
    FSDataInputStream stream = null;
    Path file = split.getPath();
    logger.debug("Path is " + file);
    fileName = file.getName();
    try {
        FileSystem fs = file.getFileSystem(configuration);
        // fs.open already returns an FSDataInputStream; wrapping it in another one is redundant.
        stream = fs.open(file);
        logger.debug("Opened stream");
    } catch (IOException e) {
        // Report through the logger, with the cause, rather than printing to stderr.
        logger.error("Could not open " + file, e);
    }
    return stream;
}
From source file:co.nubetech.hiho.mapreduce.lib.input.FileStreamRecordReader.java
License:Apache License
/**
 * Advances to the single record of this reader. The first call opens the split's file into
 * {@code stream}, stores its name in {@code fileName} and marks the reader as read; every later
 * call does nothing.
 *
 * @return {@code true} on the first call, {@code false} afterwards
 * @throws IOException if the file system cannot be obtained or the file cannot be opened
 */
@Override
public boolean nextKeyValue() throws IOException {
    logger.debug("Inside nextKeyValue");
    if (isRead) {
        return false;
    }

    final Path splitPath = split.getPath();
    logger.debug("Path is " + splitPath);
    fileName = splitPath.getName();

    stream = splitPath.getFileSystem(context.getConfiguration()).open(splitPath);
    logger.debug("Opened stream");

    isRead = true;
    return true;
}
From source file:co.nubetech.hiho.merge.TestMergeJob.java
License:Apache License
@Test public void testMergeByKeyWithDelimitedTextInputFormat() throws Exception { final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253" + "\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; final String inputData2 = "Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" + "\nMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; createTextFileInHDFS(inputData1, "/input1", "testFile1.txt"); createTextFileInHDFS(inputData2, "/input2", "testFile2.txt"); String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "key", "-outputPath", "output", "-inputFormat", "co.nubetech.hiho.dedup.DelimitedTextInputFormat", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName", "org.apache.hadoop.io.Text" }; MergeJob job = runMergeJobs(args);//from ww w .ja v a2 s . co m assertEquals(3, job.getTotalRecordsNew()); assertEquals(3, job.getTotalRecordsOld()); assertEquals(0, job.getBadRecords()); assertEquals(4, job.getOutput()); FileSystem outputFS = getFileSystem(); Path outputPath = new Path(outputFS.getHomeDirectory(), "output"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"); expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"); expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"); expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? 
" + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(4, count); }
From source file:co.nubetech.hiho.merge.TestMergeJob.java
License:Apache License
@Test public void testMergeByValueWithDelimitedTextInputFormat() throws Exception { final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253" + "\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; final String inputData2 = "Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714" + "\nTimon Leonard,716 Ac Ave,1-857-935-3882,62240"; createTextFileInHDFS(inputData1, "/input1", "testFile1.txt"); createTextFileInHDFS(inputData2, "/input2", "testFile2.txt"); String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "value", "-outputPath", "output", "-inputFormat", "co.nubetech.hiho.dedup.DelimitedTextInputFormat", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName", "org.apache.hadoop.io.Text", }; MergeJob job = runMergeJobs(args);//from w w w . j a v a 2 s .co m assertEquals(3, job.getTotalRecordsNew()); assertEquals(3, job.getTotalRecordsOld()); assertEquals(0, job.getBadRecords()); assertEquals(5, job.getOutput()); FileSystem outputFS = getFileSystem(); Path outputPath = new Path(outputFS.getHomeDirectory(), "output"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"); expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"); expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"); expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"); expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? 
" + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(5, count); }
From source file:co.nubetech.hiho.merge.TestMergeJob.java
License:Apache License
@Test public void testMergeByValueWithTextInputFormat() throws Exception { final String inputData1 = "Macon Kent,6269 Aenean St.,1-247-399-1051,08253" + "\nDale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; final String inputData2 = "Timon Leonard,716 Ac Ave,1-857-935-3882,62240" + "\nMacaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584" + "\nCharles Wood,525-9709 In Rd.,1-370-528-4758,62714"; createTextFileInHDFS(inputData1, "/input1", "testFile1.txt"); createTextFileInHDFS(inputData2, "/input2", "testFile2.txt"); String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "value", "-outputPath", "output", "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "-outputFormat", "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat" }; MergeJob job = runMergeJobs(args);// ww w . j a v a 2 s .com assertEquals(3, job.getTotalRecordsNew()); assertEquals(3, job.getTotalRecordsOld()); assertEquals(0, job.getBadRecords()); assertEquals(5, job.getOutput()); FileSystem outputFS = getFileSystem(); Path outputPath = new Path(outputFS.getHomeDirectory(), "output"); FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter()); assertTrue(outputFS.exists(outputPath)); List<String> expectedOutput = new ArrayList<String>(); expectedOutput.add("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"); expectedOutput.add("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"); expectedOutput.add("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"); expectedOutput.add("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"); expectedOutput.add("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"); int count = 0; for (FileStatus fileStat : status) { logger.debug("File status is " + fileStat.getPath() + " and is it a dir? 
" + fileStat.isDirectory()); FSDataInputStream in = outputFS.open(fileStat.getPath()); String line = null; while ((line = in.readLine()) != null) { logger.debug("Output is " + line); assertTrue("Matched output " + line, expectedOutput.contains(line)); expectedOutput.remove(line); count++; } in.close(); } assertEquals(5, count); }