Example usage for the org.apache.hadoop.io.Text constructor

List of usage examples for the org.apache.hadoop.io.Text constructor

Introduction

On this page you can find example usages of the org.apache.hadoop.io.Text constructor.

Prototype

public Text(byte[] utf8) 

Source Link

Document

Construct from a byte array.

Usage

From source file:co.cask.tephra.persist.HDFSTransactionLogTest.java

License:Apache License

/**
 * Creates a {@link SequenceFile.Writer} for a new log file named {@code LOG_FILE_PREFIX + timeInMillis}
 * inside the directory configured under {@code TxConstants.Manager.CFG_TX_SNAPSHOT_DIR}.
 *
 * @param configuration configuration that supplies the snapshot directory and is passed to the writer
 * @param fs            file system the log file is created on
 * @param timeInMillis  value appended to the log file prefix to form the file name
 * @param withMarker    when {@code true}, the file metadata records the current transaction log version
 *                      under {@code TxConstants.TransactionLog.VERSION_KEY}
 * @return an uncompressed writer with {@code LongWritable} keys and {@code TransactionEdit} values
 * @throws IOException if the writer cannot be created
 */
private SequenceFile.Writer getSequenceFileWriter(Configuration configuration, FileSystem fs, long timeInMillis,
        boolean withMarker) throws IOException {
    SequenceFile.Metadata metadata = new SequenceFile.Metadata();
    if (withMarker) {
        Text versionKey = new Text(TxConstants.TransactionLog.VERSION_KEY);
        Text versionValue = new Text(Byte.toString(TxConstants.TransactionLog.CURRENT_VERSION));
        metadata.set(versionKey, versionValue);
    }

    String snapshotDir = configuration.get(TxConstants.Manager.CFG_TX_SNAPSHOT_DIR);
    Path newLog = new Path(snapshotDir, LOG_FILE_PREFIX + timeInMillis);

    // The two null arguments are the compression codec and the progress callback.
    return SequenceFile.createWriter(fs, configuration, newLog, LongWritable.class, TransactionEdit.class,
            SequenceFile.CompressionType.NONE, null, null, metadata);
}

From source file:co.nubetech.hiho.dedup.HihoTuple.java

License:Apache License

/**
 * Stores the given key and refreshes the fields derived from it: the hash returned by
 * {@code HashUtility.getMD5Hash(key)} and the key's class name wrapped in a {@link Text}.
 *
 * @param key the key to store; its runtime class name is recorded in {@code keyClass}
 * @throws IOException declared by this method; presumably raised while hashing the key (confirm against
 *                     {@code HashUtility})
 */
public void setKey(K key) throws IOException {
    this.key = key;
    this.hash = HashUtility.getMD5Hash(key);

    String keyClassName = key.getClass().getName();
    this.keyClass = new Text(keyClassName);

    logger.debug("Key is: " + this.getKey());
    logger.debug("Hash is: " + this.getHash());
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

/**
 * Runs the dedup job with {@code -dedupBy key} over two sequence files keyed by {@link IntWritable}
 * and checks the job counters and the keys found in the output sequence file.
 *
 * <p>The two inputs hold six records with keys {1, 2, 3} and {1, 2, 4}, so four distinct keys and
 * two duplicates are expected.
 */
@Test
public void testDedupByIntWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new IntWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new IntWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new IntWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new IntWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "key" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    List<IntWritable> expectedOutput = new ArrayList<IntWritable>();
    expectedOutput.add(new IntWritable(1));
    expectedOutput.add(new IntWritable(2));
    expectedOutput.add(new IntWritable(3));
    expectedOutput.add(new IntWritable(4));

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    int count = 0;
    try {
        Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        while (reader.next(writableKey, writableValue)) {
            logger.debug("key and value is: " + writableKey + ", " + writableValue);
            // remove() instead of contains(): a key emitted twice must fail the test, not pass it.
            assertTrue("Unexpected or repeated key in output: " + writableKey,
                    expectedOutput.remove(writableKey));
            count++;
        }
    } finally {
        // Close the reader even when an assertion above fails.
        IOUtils.closeStream(reader);
    }
    assertEquals(4, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

/**
 * Runs the dedup job with {@code -dedupBy value} over two sequence files and checks the job counters
 * and the values found in the output sequence file.
 *
 * <p>The six input records contain one repeated value (the "Drake Mckinney,Murphy Baird,..." line
 * appears in both inputs), so five distinct values and one duplicate are expected.
 */
@Test
public void testDedupByValueWithSequenceFileInputFormat() throws Exception {
    HashMap<IntWritable, Text> inputData1 = new HashMap<IntWritable, Text>();
    inputData1.put(new IntWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new IntWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new IntWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<IntWritable, Text> inputData2 = new HashMap<IntWritable, Text>();
    inputData2.put(new IntWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new IntWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new IntWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.IntWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    List<Text> expectedOutput = new ArrayList<Text>();
    expectedOutput.add(new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    expectedOutput.add(new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    expectedOutput.add(new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    expectedOutput.add(new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    expectedOutput.add(new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    int count = 0;
    try {
        Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        while (reader.next(writableKey, writableValue)) {
            logger.debug("key and value is: " + writableKey + ", " + writableValue);
            // remove() instead of contains(): a value emitted twice must fail the test, not pass it.
            assertTrue("Unexpected or repeated value in output: " + writableValue,
                    expectedOutput.remove(writableValue));
            count++;
        }
    } finally {
        // Close the reader even when an assertion above fails.
        IOUtils.closeStream(reader);
    }
    assertEquals(5, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

/**
 * Runs the dedup job with {@code -dedupBy key} over two sequence files keyed by {@link LongWritable}
 * and checks the job counters and the keys found in the output sequence file.
 *
 * <p>The two inputs hold six records with keys {1, 2, 3} and {1, 2, 4}, so four distinct keys and
 * two duplicates are expected.
 */
@Test
public void testDedupByLongWritableKeyWithSequenceFileInputFormat() throws Exception {
    HashMap<LongWritable, Text> inputData1 = new HashMap<LongWritable, Text>();
    inputData1.put(new LongWritable(1),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new LongWritable(2),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new LongWritable(3),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<LongWritable, Text> inputData2 = new HashMap<LongWritable, Text>();
    inputData2.put(new LongWritable(1),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new LongWritable(2),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new LongWritable(4),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.LongWritable",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "key" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    List<LongWritable> expectedOutput = new ArrayList<LongWritable>();
    expectedOutput.add(new LongWritable(1));
    expectedOutput.add(new LongWritable(2));
    expectedOutput.add(new LongWritable(3));
    expectedOutput.add(new LongWritable(4));

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    int count = 0;
    try {
        Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        while (reader.next(writableKey, writableValue)) {
            logger.debug("key and value is: " + writableKey + ", " + writableValue);
            // remove() instead of contains(): a key emitted twice must fail the test, not pass it.
            assertTrue("Unexpected or repeated key in output: " + writableKey,
                    expectedOutput.remove(writableKey));
            count++;
        }
    } finally {
        // Close the reader even when an assertion above fails.
        IOUtils.closeStream(reader);
    }
    assertEquals(4, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

/**
 * Runs the dedup job with {@code -dedupBy value} using {@code SequenceFileAsTextInputFormat} for input
 * and {@code NoKeyOnlyValueOutputFormat} for output, then reads the output files line by line and
 * checks that each expected value line appears exactly once.
 *
 * <p>The six input records contain one repeated value, so five output lines and one duplicate are
 * expected.
 */
@Test
public void testDedupByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<Text, Text> inputData1 = new HashMap<Text, Text>();
    inputData1.put(new Text("1"),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new Text("2"),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new Text("3"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Text, Text> inputData2 = new HashMap<Text, Text>();
    inputData2.put(new Text("1"),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new Text("2"),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new Text("4"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-inputPath", "/input1,/input2",
            "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName",
            "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    // Check existence before listing, so a missing output directory is reported by this assertion
    // rather than by whatever listStatus does for a non-existent path.
    assertTrue(outputFS.exists(outputPath));
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());

    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");

    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        try {
            // NOTE(review): readLine() on a data input stream is deprecated in the JDK; kept here
            // because replacing it would need imports this snippet does not show.
            String line = null;
            while ((line = in.readLine()) != null) {
                logger.debug("Output is " + line);
                // remove() both checks membership and consumes the entry, so a repeated line fails.
                assertTrue("Unexpected or repeated line in output: " + line, expectedOutput.remove(line));
                count++;
            }
        } finally {
            // Close the stream even when an assertion above fails.
            in.close();
        }
    }
    assertEquals(5, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupJob.java

License:Apache License

/**
 * Runs the dedup job with {@code -dedupBy key} over two sequence files keyed by the custom
 * {@code Student} writable and checks the job counters and the keys found in the output sequence file.
 *
 * <p>The inputs use students {1, 2, 3} and {2, 3, 4} as keys, so four distinct keys and two
 * duplicates are expected out of six records. Matching output keys against the expected students
 * relies on {@code Student.equals}.
 */
@Test
public void testDedupByCustomObjectKeyWithSequenceFileInputFormat() throws Exception {
    Student student1 = setStudent(new Text("Sam"), new Text("US"), new IntWritable(1),
            new LongWritable(9999999998L), new DoubleWritable(99.12));
    Student student2 = setStudent(new Text("John"), new Text("AUS"), new IntWritable(2),
            new LongWritable(9999999999L), new DoubleWritable(90.12));
    Student student3 = setStudent(new Text("Mary"), new Text("UK"), new IntWritable(3),
            new LongWritable(9999999988L), new DoubleWritable(69.12));
    Student student4 = setStudent(new Text("Kelvin"), new Text("UK"), new IntWritable(4),
            new LongWritable(9999998888L), new DoubleWritable(59.12));

    HashMap<Student, Text> inputData1 = new HashMap<Student, Text>();
    inputData1.put(student1, new Text("Macon Kent,6269 Aenean St.,1-247-399-1051,08253"));
    inputData1.put(student2, new Text("Dale Zamora,521-7792 Mauris Rd.,1-214-625-6970,90510"));
    inputData1.put(student3, new Text("Charles Wood,525-9709 In Rd.,1-370-528-4758,62714"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");

    HashMap<Student, Text> inputData2 = new HashMap<Student, Text>();
    inputData2.put(student2, new Text("Austin Farley,4794 Donec Ave,1-230-823-8164,13508"));
    inputData2.put(student3, new Text("Macaulay Jackson,5435 Dui. Avenue,1-770-395-6446,31584"));
    inputData2.put(student4, new Text("Timon Leonard,716 Ac Ave,1-857-935-3882,62240"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");

    String[] args = new String[] { "-inputPath", "/input1,/input2", "-outputPath", "output", "-dedupBy", "key",
            "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
            "-inputKeyClassName", "co.nubetech.hiho.testdata.Student", "-inputValueClassName",
            "org.apache.hadoop.io.Text" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(4, job.getOutput());
    assertEquals(2, job.getDuplicateRecords());

    List<Student> expectedOutput = new ArrayList<Student>();
    expectedOutput.add(student1);
    expectedOutput.add(student2);
    expectedOutput.add(student3);
    expectedOutput.add(student4);

    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output/part-r-00000");
    Configuration conf = new Configuration();
    SequenceFile.Reader reader = new SequenceFile.Reader(outputFS, outputPath, conf);
    int count = 0;
    try {
        Writable writableKey = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        Writable writableValue = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        while (reader.next(writableKey, writableValue)) {
            logger.debug("key and value is: " + writableKey + ", " + writableValue);
            // remove() instead of contains(): a key emitted twice must fail the test, not pass it.
            assertTrue("Unexpected or repeated key in output: " + writableKey,
                    expectedOutput.remove(writableKey));
            count++;
        }
    } finally {
        // Close the reader even when an assertion above fails.
        IOUtils.closeStream(reader);
    }
    assertEquals(4, count);
}

From source file:co.nubetech.hiho.dedup.TestDedupKeyMapper.java

License:Apache License

/**
 * Maps a single {@code Text} key / {@code String} value pair through {@code DedupKeyMapper} and checks
 * that the mapper writes a {@code HihoTuple} built from the key together with the unchanged value, and
 * that the {@code TOTAL_RECORDS_READ} counter ends at 1.
 */
@Test
public final void testMapperValidValues() throws IOException, InterruptedException {
    Text key = new Text("abc123");
    String val = "valueOfKey";

    // Back the mocked context with a real counter so increments made by the mapper are observable.
    Counters counters = new Counters();
    Counter totalRead = counters.findCounter(DedupRecordCounter.TOTAL_RECORDS_READ);
    Mapper.Context context = mock(Mapper.Context.class);
    when(context.getCounter(DedupRecordCounter.TOTAL_RECORDS_READ)).thenReturn(totalRead);

    DedupKeyMapper<Text, String> mapper = new DedupKeyMapper<Text, String>();
    mapper.map(key, val, context);

    HihoTuple<Text> expectedTuple = new HihoTuple<Text>();
    expectedTuple.setKey(key);
    verify(context).write(expectedTuple, val);
    assertEquals(1, context.getCounter(DedupRecordCounter.TOTAL_RECORDS_READ).getValue());
}

From source file:co.nubetech.hiho.dedup.TestDedupKeyReducer.java

License:Apache License

/**
 * Reduces one {@code HihoTuple} key with three string values through {@code DedupKeyReducer} and checks
 * that the original key is written with the first value and that the {@code OUTPUT} counter ends at 1.
 */
@Test
public void testReducerValidValues() throws IOException, InterruptedException {
    Text key = new Text("key123");
    HihoTuple<Text> hihoTuple = new HihoTuple<Text>();
    hihoTuple.setKey(key);

    // Plain literals: the verification below matches arguments by equals(), so fresh String
    // instances are unnecessary.
    String value1 = "value1";
    String value2 = "value2";
    String value3 = "value3";
    ArrayList<String> values = new ArrayList<String>();
    values.add(value1);
    values.add(value2);
    values.add(value3);

    // Back the mocked context with a real counter so increments made by the reducer are observable.
    Reducer.Context context = mock(Reducer.Context.class);
    Counters counters = new Counters();
    Counter counter = counters.findCounter(DedupRecordCounter.OUTPUT);
    when(context.getCounter(DedupRecordCounter.OUTPUT)).thenReturn(counter);
    DedupKeyReducer dedupReducer = new DedupKeyReducer();
    dedupReducer.reduce(hihoTuple, values, context);
    verify(context).write(key, value1);
    assertEquals(1, context.getCounter(DedupRecordCounter.OUTPUT).getValue());
}

From source file:co.nubetech.hiho.dedup.TestDedupKeyReducer.java

License:Apache License

/**
 * Reduces one {@code HihoTuple} wrapping a {@code LongWritable} key with a single {@code Text} value
 * through {@code DedupKeyReducer} and checks that the original key and value are written and that the
 * {@code OUTPUT} counter ends at 1.
 */
@Test
public void testReducerForLongWritableKey() throws IOException, InterruptedException {
    LongWritable key = new LongWritable(123L);
    Text value1 = new Text("value1");
    ArrayList<Text> values = new ArrayList<Text>();
    values.add(value1);

    HihoTuple hihoTuple = new HihoTuple();
    hihoTuple.setKey(key);

    // Back the mocked context with a real counter so increments made by the reducer are observable.
    Counters counters = new Counters();
    Counter outputCounter = counters.findCounter(DedupRecordCounter.OUTPUT);
    Reducer.Context context = mock(Reducer.Context.class);
    when(context.getCounter(DedupRecordCounter.OUTPUT)).thenReturn(outputCounter);

    DedupKeyReducer dedupReducer = new DedupKeyReducer();
    dedupReducer.reduce(hihoTuple, values, context);

    verify(context).write(key, value1);
    assertEquals(1, context.getCounter(DedupRecordCounter.OUTPUT).getValue());
}