Example usage for org.apache.hadoop.fs FileSystem getHomeDirectory

Introduction

On this page you can find example usages of FileSystem.getHomeDirectory() from the org.apache.hadoop.fs package.

Prototype

public Path getHomeDirectory() 

Document

Return the current user's home directory in this FileSystem.
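
Below is a minimal, self-contained sketch (not taken from the examples on this page) showing the typical pattern: obtain a FileSystem, call getHomeDirectory(), and resolve a relative path against it. The class name, configuration, and path names are illustrative only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HomeDirectoryExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();   // picks up core-site.xml from the classpath, if present
        FileSystem fs = FileSystem.get(conf);       // the default FileSystem (HDFS, or the local FS in tests)
        Path home = fs.getHomeDirectory();          // typically /user/<current-user> on HDFS
        // Relative paths are commonly resolved against the home directory,
        // which is exactly what the test cases below do with their "output" directories.
        Path output = new Path(home, "output/part-r-00000");
        System.out.println("Home directory: " + home);
        System.out.println("Resolved output: " + output);
    }
}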

Usage

From source file: co.nubetech.hiho.dedup.TestDedupJob.java

License: Apache License

@Test
public void testDedupByIntWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new IntWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new IntWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new IntWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new IntWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "key" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
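    // The dedup job wrote to the relative path "output", which resolves under the current user's home directory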
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<IntWritable> expectedOutput = new ArrayList<IntWritable>();
    expectedOutput.add(new IntWritable(1));
    expectedOutput.add(new IntWritable(2));
    expectedOutput.add(new IntWritable(3));
    expectedOutput.add(new IntWritable(4));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutput.contains(writableKey));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
}

From source file: co.nubetech.hiho.dedup.TestDedupJob.java

License: Apache License

@Test
public void testDedupByValueWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new IntWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new IntWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new IntWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new IntWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<Text> expectedOutput = new ArrayList<Text>();
    expectedOutput.add(new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    expectedOutput.add(new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    expectedOutput.add(new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    expectedOutput.add(new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    expectedOutput.add(new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableValue, expectedOutput.contains(writableValue));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(5, count);
}

From source file: co.nubetech.hiho.dedup.TestDedupJob.java

License: Apache License

@Test
public void testDedupByLongWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<LongWritable, Text> inputData1 = new HashMap<LongWritable, Text>();
    inputData1.put(new LongWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new LongWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new LongWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<LongWritable, Text> inputData2 = new HashMap<LongWritable, Text>();
    inputData2.put(new LongWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new LongWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new LongWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.LongWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "key" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<LongWritable> expectedOutput = new ArrayList<LongWritable>();
    expectedOutput.add(new LongWritable(1));
    expectedOutput.add(new LongWritable(2));
    expectedOutput.add(new LongWritable(3));
    expectedOutput.add(new LongWritable(4));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutput.contains(writableKey));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
}

From source file: co.nubetech.hiho.dedup.TestDedupJob.java

License: Apache License

@Test
public void testDedupByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<Text, Text> inputData1 = new HashMap<Text, Text>();
    inputData1.put(new Text("1"),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new Text("2"),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new Text("3"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Text, Text> inputData2 = new HashMap<Text, Text>();
    inputData2.put(new Text("1"),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new Text("2"),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new Text("4"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName",
            "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
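    // This run used a text output format, so the files under "output" (resolved against the home directory) are read back line by line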
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}

From source file: co.nubetech.hiho.dedup.TestDedupJob.java

License: Apache License

@Test
public void testDedupByCustomObjectKeyWithSequenceFileInputFormat() throws Exception {
    Student student1 = setStudent(new Text("Sam"), new Text("US"), new IntWritable(1),
            new LongWritable(9999999998l), new DoubleWritable(99.12));
    Student student2 = setStudent(new Text("John"), new Text("AUS"), new IntWritable(2),
            new LongWritable(9999999999l), new DoubleWritable(90.12));
    Student student3 = setStudent(new Text("Mary"), new Text("UK"), new IntWritable(3),
            new LongWritable(9999999988l), new DoubleWritable(69.12));
    Student student4 = setStudent(new Text("Kelvin"), new Text("UK"), new IntWritable(4),
            new LongWritable(9999998888l), new DoubleWritable(59.12));

    HashMap<Student, Text> inputData1 = new HashMap<Student, Text>();
    inputData1.put(student1, new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(student2, new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(student3, new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Student, Text> inputData2 = new HashMap<Student, Text>();
    inputData2.put(student2, new Text("Austin Farley,4794 Donec Ave,1-230-823-8164,13508"));
    inputData2.put(student3, new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(student4, new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputPath", "/input1,/input2", "-outputPath", "output", "-dedupBy", "key",
            "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
            "-inputKeyClassName", "co.nubetech.hiho.testdata.Student", "-inputValueClassName",
            "org.apache.hadoop.io.Text" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<Student> expectedOutput = new ArrayList<Student>();
    expectedOutput.add(student1);
    expectedOutput.add(student2);
    expectedOutput.add(student3);
    expectedOutput.add(student4);
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutput.contains(writableKey));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
}

From source file: co.nubetech.hiho.job.TestDBQueryInputJobWithCluster.java

License: Apache License

@Test
public void testBasicTableImport() throws Exception {
    DBQueryInputJob job = new DBQueryInputJob();

    String[] args = new String[] { "-jdbcDriver", "org.hsqldb.jdbcDriver", "-jdbcUrl",
            "jdbc:hsqldb:hsql://localhost/URLAccess",
            //   "-jdbcUsername", "",
            //   "-jdbcPassword", "",
            "-outputPath", "testBasicTableImport", "-outputStrategy", "delimited", "-delimiter", "DELIM",
            "-numberOfMappers", "2", "-inputTableName", "Pageview", "-inputOrderBy", "pageview" };
    int res = ToolRunner.run(createJobConf(), job, args);
    assertEquals(0, res);
    //lets verify the result now
    FileSystem outputFS = getFileSystem();
    //Path outputPath = getOutputDir();

    Path outputPath = new Path(outputFS.getHomeDirectory(), "testBasicTableImport");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("/aDELIM1000");
    expectedOutput.add("/bDELIM2000");
    expectedOutput.add("/cDELIM3000");
    expectedOutput.add("/dDELIM4000");
    expectedOutput.add("/eDELIM5000");
    expectedOutput.add("/fDELIM6000");
    expectedOutput.add("/gDELIM7000");
    expectedOutput.add("/hDELIM8000");
    expectedOutput.add("/iDELIM9000");
    expectedOutput.add("/jDELIM10000");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(10, count);
}

From source file: co.nubetech.hiho.job.TestDBQueryInputJobWithCluster.java

License: Apache License

@Test
public void testBasicAvroTableImport() throws Exception {
    DBQueryInputJob job = new DBQueryInputJob();

    String[] args = new String[] { "-jdbcDriver", "org.hsqldb.jdbcDriver", "-jdbcUrl",
            "jdbc:hsqldb:hsql://localhost/URLAccess", "-outputPath", "testQueryBasedImport", "-inputQuery",
            "select url,pageview,commentCount from Pageview, PageComment where Pageview.url = PageComment.url",
            "-inputBoundingQuery", "select min(commentCount), max(commentCount) from PageComment",
            "-outputStrategy", "AVRO", "-delimiter", "DELIM", "-numberOfMappers", "2", "-inputOrderBy",
            "Pageview.pageview" };
    int res = ToolRunner.run(createJobConf(), job, args);
    assertEquals(0, res);
    //lets verify the result now
    FileSystem outputFS = getFileSystem();
    //Path outputPath = getOutputDir();

    Path outputPath = new Path(outputFS.getHomeDirectory(), "testBasicTableImport");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    /*
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("/aDELIM1000");
    expectedOutput.add("/bDELIM2000");
    expectedOutput.add("/cDELIM3000");
    expectedOutput.add("/dDELIM4000");
    expectedOutput.add("/eDELIM5000");
    expectedOutput.add("/fDELIM6000");
    expectedOutput.add("/gDELIM7000");
    expectedOutput.add("/hDELIM8000");
    expectedOutput.add("/iDELIM9000");
    expectedOutput.add("/jDELIM10000");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(10, count);
    */
}

From source file: co.nubetech.hiho.job.TestDBQueryInputJobWithCluster.java

License: Apache License

@Test
public void testQueryBasedImport() throws Exception {
    DBQueryInputJob job = new DBQueryInputJob();

    String[] args = new String[] { "-jdbcDriver", "org.hsqldb.jdbcDriver", "-jdbcUrl",
            "jdbc:hsqldb:hsql://localhost/URLAccess", "-outputPath", "testQueryBasedImport", "-inputQuery",
            "select url,pageview,commentCount from Pageview, PageComment where Pageview.url = PageComment.url",
            "-inputBoundingQuery", "select min(commentCount), max(commentCount) from PageComment",
            "-outputStrategy", "delimited", "-delimiter", "DELIM", "-numberOfMappers", "2", "-inputOrderBy",
            "Pageview.pageview" };
    int res = ToolRunner.run(createJobConf(), job, args);
    assertEquals(0, res);
    //lets verify the result now
    FileSystem outputFS = getFileSystem();
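    // The import's relative output path "testQueryBasedImport" is likewise resolved against the user's home directory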
    Path outputPath = new Path(outputFS.getHomeDirectory(), "testQueryBasedImport");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("/aDELIM1000DELIM10");
    expectedOutput.add("/bDELIM2000DELIM10");
    expectedOutput.add("/cDELIM3000DELIM10");
    expectedOutput.add("/dDELIM4000DELIM10");
    expectedOutput.add("/eDELIM5000DELIM10");
    expectedOutput.add("/fDELIM6000DELIM10");
    expectedOutput.add("/gDELIM7000DELIM10");
    expectedOutput.add("/hDELIM8000DELIM10");
    expectedOutput.add("/iDELIM9000DELIM10");
    expectedOutput.add("/jDELIM10000DELIM10");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(10, count);
}

From source file: co.nubetech.hiho.merge.TestMergeJob.java

License: Apache License

@Test
public void testMergeByCustomObjectKeyWithSequenceFileInputFormat() throws Exception {
    Student student1 = setStudent(new Text("Sam"), new Text("US"), new IntWritable(1),
            new LongWritable(9999999998l), new DoubleWritable(99.12));
    Student student2 = setStudent(new Text("John"), new Text("AUS"), new IntWritable(2),
            new LongWritable(9999999999l), new DoubleWritable(90.12));
    Student student3 = setStudent(new Text("Mary"), new Text("UK"), new IntWritable(3),
            new LongWritable(9999999988l), new DoubleWritable(69.12));
    Student student4 = setStudent(new Text("Kelvin"), new Text("UK"), new IntWritable(4),
            new LongWritable(9999998888l), new DoubleWritable(59.12));

    HashMap<Student, Text> inputData1 = new HashMap<Student, Text>();
    inputData1.put(student1, new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(student2, new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(student3, new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Student, Text> inputData2 = new HashMap<Student, Text>();
    inputData2.put(student2, new Text("Austin Farley,4794 Donec Ave,1-230-823-8164,13508"));
    inputData2.put(student3, new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(student4, new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "key",
            "-outputPath", "output", "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputKeyClassName",
            "co.nubetech.hiho.testdata.Student", "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
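    // The merge output is read back from "output/part-r-00000" under the home directory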
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<Student> expectedOutput = new ArrayList<Student>();
    expectedOutput.add(student1);
    expectedOutput.add(student2);
    expectedOutput.add(student3);
    expectedOutput.add(student4);
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutput.contains(writableKey));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);
}

From source file: co.nubetech.hiho.merge.TestMergeJob.java

License: Apache License

@Test
public void testMergeByIntWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1), new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(new IntWritable(2), new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(new IntWritable(3), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1), new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(new IntWritable(2), new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    inputData2.put(new IntWritable(4), new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-newPath", "/input1", "-oldPath", "/input2", "-mergeBy", "key",
            "-outputPath", "output", "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputKeyClassName",
            "org.apache.hadoop.io.IntWritable", "-inputValueClassName", "org.apache.hadoop.io.Text" };
    MergeJob job = runMergeJobs(args);
    assertEquals(3, job.getTotalRecordsNew());
    assertEquals(3, job.getTotalRecordsOld());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
    Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
    List<IntWritable> expectedOutput = new ArrayList<IntWritable>();
    expectedOutput.add(new IntWritable(1));
    expectedOutput.add(new IntWritable(2));
    expectedOutput.add(new IntWritable(3));
    expectedOutput.add(new IntWritable(4));
    int count = 0;
    while (reader.next(writableKey, writableValue)) {
        logger.debug("key and value is: " + writableKey + ", " + writableValue);
        assertTrue("Matched output " + writableKey, expectedOutput.contains(writableKey));
        count++;
    }
    IOUtils.closeStream(reader);
    assertEquals(4, count);

}