Example usage for the org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl constructor

Introduction

On this page you can find example usages of the org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl constructor.

Prototype

public TaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId)

Source Link

Usage

From source file:edu.umn.cs.spatialHadoop.operations.Head.java

License:Open Source License

/**
 * Builds a placeholder task attempt context from a freshly constructed
 * {@code Configuration} and a no-argument {@code TaskAttemptID}.
 *
 * @return a new {@code TaskAttemptContextImpl} wrapping those two objects
 */
private static TaskAttemptContext createDummyContext() {
    return new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID());
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Checks that reading a file as two adjacent splits yields exactly as many
 * records as reading it as one split.
 *
 * @param conf             configuration to read with; its max line length is raised to
 *                         {@code Integer.MAX_VALUE} and its
 *                         {@code textinputformat.record.delimiter} (if set) is honoured
 * @param firstSplitLength byte length of the first split; must be smaller than the file
 * @param testFileSize     total byte length of the file
 * @param testFilePath     location of the file to read
 * @throws IOException if a reader fails
 */
private void testSplitRecordsForFile(Configuration conf, long firstSplitLength, long testFileSize,
        Path testFilePath) throws IOException {
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    assertTrue("unexpected test data at " + testFilePath, testFileSize > firstSplitLength);

    // An unset delimiter is handed to the reader as null.
    String configuredDelimiter = conf.get("textinputformat.record.delimiter");
    byte[] recordDelimiterBytes = (configuredDelimiter == null) ? null
            : configuredDelimiter.getBytes(Charsets.UTF_8);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Pass 1: the whole file as a single split.
    FileSplit wholeFile = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
    LineRecordReader wholeReader = new LineRecordReader(recordDelimiterBytes);
    wholeReader.initialize(wholeFile, context);
    int numRecordsNoSplits = 0;
    while (wholeReader.nextKeyValue()) {
        numRecordsNoSplits += 1;
    }
    wholeReader.close();

    // Pass 2: only the leading split.
    FileSplit headSplit = new FileSplit(testFilePath, 0, firstSplitLength, (String[]) null);
    LineRecordReader headReader = new LineRecordReader(recordDelimiterBytes);
    headReader.initialize(headSplit, context);
    int numRecordsFirstSplit = 0;
    while (headReader.nextKeyValue()) {
        numRecordsFirstSplit += 1;
    }
    headReader.close();

    // Pass 3: everything after the leading split.
    long remainingLength = testFileSize - firstSplitLength;
    FileSplit tailSplit = new FileSplit(testFilePath, firstSplitLength, remainingLength, (String[]) null);
    LineRecordReader tailReader = new LineRecordReader(recordDelimiterBytes);
    tailReader.initialize(tailSplit, context);
    int numRecordsRemainingSplits = 0;
    while (tailReader.nextKeyValue()) {
        numRecordsRemainingSplits += 1;
    }
    tailReader.close();

    int numRecordsBothSplits = numRecordsFirstSplit + numRecordsRemainingSplits;
    assertEquals("Unexpected number of records in split ", numRecordsNoSplits, numRecordsBothSplits);
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Reads a local file through {@code LineRecordReader}, walking it in
 * consecutive splits of {@code splitSize} bytes, and collects every record
 * returned, in the order the splits produce them.
 *
 * @param testFileUrl URL of the local file to read
 * @param splitSize   byte length of each split; must be positive
 * @return the string form of every record returned by the readers
 * @throws IOException              if a reader fails
 * @throws IllegalArgumentException if {@code splitSize} is not positive
 */
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {
    // A non-positive split size would never advance the offset below.
    if (splitSize <= 0) {
        throw new IllegalArgumentException("splitSize must be positive, got " + splitSize);
    }

    // Set up context
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Gather the records returned by the record reader
    ArrayList<String> records = new ArrayList<String>();

    long offset = 0;
    while (offset < testFileSize) {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
        LineRecordReader reader = new LineRecordReader();
        try {
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                records.add(reader.getCurrentValue().toString());
            }
        } finally {
            // One reader is opened per split; release it before moving on.
            reader.close();
        }
        offset += splitSize;
    }
    return records;
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Reads {@code testBOM.txt} as a single split and checks that the first
 * record returned does not start with the byte order mark (U+FEFF).
 *
 * @throws IOException if the reader fails
 */
@Test
public void testStripBOM() throws IOException {
    // the test data contains a BOM at the start of the file
    // confirm the BOM is skipped by LineRecordReader
    String UTF8_BOM = "\uFEFF";
    URL testFileUrl = getClass().getClassLoader().getResource("testBOM.txt");
    assertNotNull("Cannot find testBOM.txt", testFileUrl);
    File testFile = new File(testFileUrl.getFile());
    Path testFilePath = new Path(testFile.getAbsolutePath());
    long testFileSize = testFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // read the data and check whether BOM is skipped
    FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null);
    LineRecordReader reader = new LineRecordReader();
    boolean skipBOM = true;
    try {
        reader.initialize(split, context);
        boolean firstLine = true;
        // Every record is still consumed; only the first one is inspected.
        while (reader.nextKeyValue()) {
            if (firstLine) {
                firstLine = false;
                if (reader.getCurrentValue().toString().startsWith(UTF8_BOM)) {
                    skipBOM = false;
                }
            }
        }
    } finally {
        // Release the reader even when initialize/nextKeyValue throws.
        reader.close();
    }

    assertTrue("BOM is not skipped", skipBOM);
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Reads a bzip2-compressed file to the end, closes the reader twice, and
 * then checks that ten decompressors taken from {@code CodecPool} are ten
 * distinct objects.
 *
 * @throws IOException if the reader fails
 */
@Test
public void testMultipleClose() throws IOException {
    URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2");
    assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl);
    File compressedFile = new File(testFileUrl.getFile());
    Path compressedPath = new Path(compressedFile.getAbsolutePath());
    long compressedLength = compressedFile.length();
    Configuration conf = new Configuration();
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    // Consume the whole file as one split; the records themselves are not inspected.
    FileSplit wholeFile = new FileSplit(compressedPath, 0, compressedLength, null);
    LineRecordReader reader = new LineRecordReader();
    reader.initialize(wholeFile, context);
    while (reader.nextKeyValue()) {
        // drain
    }

    // The second close is the behaviour under test.
    reader.close();
    reader.close();

    // Presumably a double close that returned the same decompressor to the pool
    // twice would make two of these borrows identical and shrink the set.
    BZip2Codec codec = new BZip2Codec();
    codec.setConf(conf);
    Set<Decompressor> borrowed = new HashSet<Decompressor>();
    int remaining = 10;
    while (remaining > 0) {
        borrowed.add(CodecPool.getDecompressor(codec));
        remaining--;
    }
    assertEquals(10, borrowed.size());
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Checks the key (byte position) and value length reported by
 * {@code LineRecordReader} for uncompressed input with multi-character
 * record delimiters ("++" and "|+|"), across several split layouts and
 * {@code io.file.buffer.size} settings.
 *
 * The {@code key} and {@code value} references are fetched once and then
 * asserted on after later {@code nextKeyValue()} calls, so the assertions
 * rely on the reader updating those same objects in place. The statement
 * order in this test is therefore significant.
 *
 * @throws Exception if creating the input file or reading it fails
 */
@Test
public void testUncompressedInputCustomDelimiterPosValue() throws Exception {
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 10);
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    String inputData = "abcdefghij++kl++mno";
    Path inputFile = createInputFile(conf, inputData);
    String delimiter = "++";
    byte[] recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    int splitLength = 15;
    FileSplit split = new FileSplit(inputFile, 0, splitLength, (String[]) null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // Get first record: "abcdefghij"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    LongWritable key = reader.getCurrentKey();
    Text value = reader.getCurrentValue();
    assertEquals("Wrong length for record value", 10, value.getLength());
    assertEquals("Wrong position after record read", 0, key.get());
    // Get second record: "kl"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong length for record value", 2, value.getLength());
    // Key should be 12 right after "abcdefghij++"
    assertEquals("Wrong position after record read", 12, key.get());
    // Get third record: "mno"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // Key should be 16 right after "abcdefghij++kl++"
    assertEquals("Wrong position after record read", 16, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 19 right after "abcdefghij++kl++mno"
    // (checked on the previously fetched key object, not a fresh getCurrentKey())
    assertEquals("Wrong position after record read", 19, key.get());
    // after refresh should be empty
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();
    split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // No record is in the second split because the second split dropped
    // the first record, which was already reported by the first split.
    assertFalse("Unexpected record returned", reader.nextKeyValue());
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();

    // multi char delimiter with starting part of the delimiter in the data
    inputData = "abcd+efgh++ijk++mno";
    inputFile = createInputFile(conf, inputData);
    splitLength = 5;
    split = new FileSplit(inputFile, 0, splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    // Get first record: "abcd+efgh"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    assertEquals("Wrong position after record read", 0, key.get());
    assertEquals("Wrong length for record value", 9, value.getLength());
    // should have jumped over the delimiter, no record
    assertFalse(reader.nextKeyValue());
    assertEquals("Wrong position after record read", 11, key.get());
    // after refresh should be empty
    key = reader.getCurrentKey();
    assertNull("Unexpected key returned", key);
    reader.close();
    // next split: check for duplicate or dropped records
    split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null);
    reader = new LineRecordReader(recordDelimiterBytes);
    reader.initialize(split, context);
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get second record: "ijk" first in this split
    assertEquals("Wrong position after record read", 11, key.get());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // Get third record: "mno" second in this split
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    assertEquals("Wrong position after record read", 16, key.get());
    assertEquals("Wrong length for record value", 3, value.getLength());
    // should be at the end of the input
    assertFalse(reader.nextKeyValue());
    assertEquals("Wrong position after record read", 19, key.get());
    reader.close();

    // Third scenario: three-byte delimiter "|+|" whose first and last byte
    // also occur on their own inside the records.
    inputData = "abcd|efgh|+|ij|kl|+|mno|pqr";
    inputFile = createInputFile(conf, inputData);
    delimiter = "|+|";
    recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
    // walking over the buffer and split sizes checks for proper processing
    // of the ambiguous bytes of the delimiter
    // NOTE(review): splitSize is never read in the loop body; the split below is
    // built with bufferSize as its length, so every inner iteration repeats the
    // same scenario. The comment above suggests splitSize was intended - confirm
    // against the reader under test before changing it.
    for (int bufferSize = 1; bufferSize <= inputData.length(); bufferSize++) {
        for (int splitSize = 1; splitSize < inputData.length(); splitSize++) {
            // track where we are in the inputdata
            int keyPosition = 0;
            conf.setInt("io.file.buffer.size", bufferSize);
            split = new FileSplit(inputFile, 0, bufferSize, (String[]) null);
            reader = new LineRecordReader(recordDelimiterBytes);
            reader.initialize(split, context);
            // Get the first record: "abcd|efgh" always possible
            assertTrue("Expected record got nothing", reader.nextKeyValue());
            key = reader.getCurrentKey();
            value = reader.getCurrentValue();
            assertTrue("abcd|efgh".equals(value.toString()));
            // Position should be 0 right at the start
            assertEquals("Wrong position after record read", keyPosition, key.get());
            // Position should be 12 right after the first "|+|"
            keyPosition = 12;
            // get the next record: "ij|kl" if the split/buffer allows it
            if (reader.nextKeyValue()) {
                // check the record info: "ij|kl"
                assertTrue("ij|kl".equals(value.toString()));
                assertEquals("Wrong position after record read", keyPosition, key.get());
                // Position should be 20 after the second "|+|"
                keyPosition = 20;
            }
            // get the third record: "mno|pqr" if the split/buffer allows it
            if (reader.nextKeyValue()) {
                // check the record info: "mno|pqr"
                assertTrue("mno|pqr".equals(value.toString()));
                assertEquals("Wrong position after record read", keyPosition, key.get());
                // Position should be the end of the input
                keyPosition = inputData.length();
            }
            assertFalse("Unexpected record returned", reader.nextKeyValue());
            // no more records can be read we should be at the last position
            assertEquals("Wrong position after record read", keyPosition, key.get());
            // after refresh should be empty
            key = reader.getCurrentKey();
            assertNull("Unexpected key returned", key);
            reader.close();
        }
    }
}

From source file:example.TestLineRecordReader.java

License:Apache License

/**
 * Checks the key (byte position) and value length reported by
 * {@code LineRecordReader} for uncompressed input when no custom delimiter
 * is supplied, using input that contains CR LF and a lone CR.
 *
 * The {@code key} and {@code value} references are fetched once per reader
 * and asserted on after later {@code nextKeyValue()} calls, so the
 * assertions rely on the reader updating those objects in place.
 *
 * @throws Exception if creating the input file or reading it fails
 */
@Test
public void testUncompressedInputDefaultDelimiterPosValue() throws Exception {
    Configuration conf = new Configuration();
    String inputData = "1234567890\r\n12\r\n345";
    Path inputFile = createInputFile(conf, inputData);
    conf.setInt("io.file.buffer.size", 10);
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(null);
    reader.initialize(split, context);
    LongWritable key;
    Text value;
    // Fail with a clear message if a record is missing, rather than on the
    // getCurrentKey()/getCurrentValue() results below.
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record:"1234567890"
    assertEquals(10, value.getLength());
    assertEquals(0, key.get());
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    // Get second record:"12"
    assertEquals(2, value.getLength());
    // Key should be 12 right after "1234567890\r\n"
    assertEquals(12, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());
    reader.close();

    split = new FileSplit(inputFile, 15, 4, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    // The second split dropped the first record "\n"
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get third record:"345"
    assertEquals(3, value.getLength());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 19 right after "1234567890\r\n12\r\n345"
    assertEquals(19, key.get());
    reader.close();

    inputData = "123456789\r\r\n";
    inputFile = createInputFile(conf, inputData);
    split = new FileSplit(inputFile, 0, 12, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record:"123456789"
    assertEquals(9, value.getLength());
    assertEquals(0, key.get());
    assertTrue("Expected record got nothing", reader.nextKeyValue());
    // Get second record:""
    assertEquals(0, value.getLength());
    // Key should be 10 right after "123456789\r"
    assertEquals(10, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 12 right after "123456789\r\r\n"
    assertEquals(12, key.get());
    reader.close();
}

From source file:gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormatTest.java

License:Apache License

/**
 * Checks that the record reader created for a {@code GobblinSplit} of two
 * paths returns them in order, keyed 0 and 1, and then reports no more
 * records.
 *
 * @throws Exception if the record reader fails
 */
@Test
public void testRecordReader() throws Exception {

    List<String> paths = Lists.newArrayList("/path1", "/path2");
    GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);

    GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
    RecordReader<LongWritable, Text> recordReader = inputFormat.createRecordReader(split,
            new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));

    // Assert that each expected record is actually produced, so a missing record
    // fails here rather than on the key/value checks.
    Assert.assertTrue(recordReader.nextKeyValue());
    Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");

    Assert.assertTrue(recordReader.nextKeyValue());
    Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");

    Assert.assertFalse(recordReader.nextKeyValue());

}

From source file:info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * /*  w ww  . java 2 s  .c  o m*/
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:io.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

/**
 * Reads the first record of the ORC test split through
 * {@code OrcNewInputFormat}, parses it with the configured
 * {@code OrcHadoopInputRowParser}, and checks the resulting row's event
 * size, timestamp, dimension names and the {@code col1}/{@code col2} values.
 *
 * @throws IOException          if the reader fails
 * @throws InterruptedException if the reader is interrupted
 */
@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();

    try {
        reader.initialize(split, context);

        // Fail here, not inside the parser, if the split yields no record.
        Assert.assertTrue(reader.nextKeyValue());

        OrcStruct data = (OrcStruct) reader.getCurrentValue();

        MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);

        // assertEquals reports the actual size on failure.
        Assert.assertEquals(4, row.getEvent().keySet().size());
        Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
        Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
        Assert.assertEquals(col1, row.getEvent().get("col1"));
        Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));
    } finally {
        // Close the reader even when an assertion above fails.
        reader.close();
    }
}