Example usage for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.RecordReader.initialize.

Prototype

/**
 * Called once at initialization.
 *
 * @param split the input split handed to this reader
 * @param context the task attempt context handed to this reader
 * @throws IOException
 * @throws InterruptedException
 */
public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Source Link

Document

Called once at initialization.

Usage

From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java

License:Open Source License

/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 * /* w  w w  .  java  2s . c o m*/
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {

    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    //verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);

        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();

            md5.update(value.getBytes(), 0, value.getLength());
        }

        rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}

From source file:com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java

License:Apache License

/**
 * Builds a copy listing, asks {@code DynamicInputFormat} for its splits and reads every
 * split back through a record reader, checking that each returned path is expected, that
 * the reported progress never decreases and stays within [0, 1], and that the total number
 * of records matches the number of expected file paths.
 *
 * @throws Exception if building the listing or reading a split fails
 */
@Test
public void testGetSplits() throws Exception {
    DistCpOptions options = getOptions();
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
            new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testDynInputFormat/fileList.seq"),
            options);

    JobID jobId = new JobID();
    JobContext jobContext = mock(JobContext.class);
    when(jobContext.getConfiguration()).thenReturn(configuration);
    when(jobContext.getJobID()).thenReturn(jobId);
    DynamicInputFormat<Text, FileStatus> inputFormat = new DynamicInputFormat<Text, FileStatus>();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);

    int nFiles = 0;
    int taskId = 0;

    for (InputSplit split : splits) {
        // every split is read under its own task id
        TaskAttemptID tId = new TaskAttemptID("", 0, true, taskId, 0);
        final TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
        when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        when(taskAttemptContext.getTaskAttemptID()).thenReturn(tId);
        RecordReader<Text, FileStatus> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        try {
            // initialize with the same split the reader was created for
            recordReader.initialize(split, taskAttemptContext);
            float previousProgressValue = 0f;
            while (recordReader.nextKeyValue()) {
                FileStatus fileStatus = recordReader.getCurrentValue();
                String source = fileStatus.getPath().toString();
                System.out.println(source);
                Assert.assertTrue(expectedFilePaths.contains(source));
                final float progress = recordReader.getProgress();
                Assert.assertTrue(progress >= previousProgressValue);
                Assert.assertTrue(progress >= 0.0f);
                Assert.assertTrue(progress <= 1.0f);
                previousProgressValue = progress;
                ++nFiles;
            }
            Assert.assertTrue(recordReader.getProgress() == 1.0f);
        } finally {
            // release the reader even when an assertion fails
            recordReader.close();
        }

        ++taskId;
    }

    Assert.assertEquals(expectedFilePaths.size(), nFiles);
}

From source file:com.inmobi.conduit.distcp.tools.mapred.TestUniformSizeInputFormat.java

License:Apache License

/**
 * Builds a copy listing for {@code nMaps} maps, asks {@code UniformSizeInputFormat} for
 * its splits, validates them with {@code checkSplits} and then re-reads every split to
 * confirm that the lengths of all listed files add up to {@code totalFileSize}.
 *
 * @param nMaps number of maps passed to {@code getOptions} and used to compute the
 *              expected size per map
 * @throws Exception if building the listing or reading a split fails
 */
public void testGetSplits(int nMaps) throws Exception {
    DistCpOptions options = getOptions(nMaps);
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    Path listFile = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testGetSplits_1/fileList.seq");
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(listFile, options);

    JobContext jobContext = Mockito.mock(JobContext.class);
    Mockito.when(jobContext.getConfiguration()).thenReturn(configuration);
    Mockito.when(jobContext.getJobID()).thenReturn(new JobID());
    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
    List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);

    //Removing the legacy check - Refer HADOOP-9230
    int sizePerMap = totalFileSize / nMaps;

    checkSplits(listFile, splits);

    int doubleCheckedTotalSize = 0;
    // NOTE(review): previousSplitSize is never reassigned in this method, so the first
    // disjunct of the assertion below is always true and the uniformity check cannot fail.
    int previousSplitSize = -1;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        int currentSplitSize = 0;
        TaskAttemptID taskId = new TaskAttemptID("", 0, true, 0, 0);
        final TaskAttemptContext taskAttemptContext = Mockito.mock(TaskAttemptContext.class);
        Mockito.when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        Mockito.when(taskAttemptContext.getTaskAttemptID()).thenReturn(taskId);
        RecordReader<Text, FileStatus> recordReader = uniformSizeInputFormat.createRecordReader(split,
                taskAttemptContext);
        try {
            recordReader.initialize(split, taskAttemptContext);
            while (recordReader.nextKeyValue()) {
                Path sourcePath = recordReader.getCurrentValue().getPath();
                FileSystem fs = sourcePath.getFileSystem(configuration);
                FileStatus fileStatus[] = fs.listStatus(sourcePath);
                // expected value first, so a failure message reads correctly
                Assert.assertEquals(1, fileStatus.length);
                currentSplitSize += fileStatus[0].getLen();
            }
        } finally {
            // release the reader even when an assertion fails
            recordReader.close();
        }
        Assert.assertTrue(previousSplitSize == -1
                || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap || i == splits.size() - 1);

        doubleCheckedTotalSize += currentSplitSize;
    }

    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
}

From source file:com.metamx.milano.hadoop.MilanoProtoFileInputFormatTests.java

License:Apache License

/**
 * Reads {@code readFile} as a single split through {@code MilanoProtoFileInputFormat}
 * and compares every message read with the test item at the same index.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testReadFile() throws Exception {
    MilanoProtoFileInputFormat inputFormat = new MilanoProtoFileInputFormat();

    // one split covering the whole file
    FileSplit split = new FileSplit(readFile, 0, protoTestObjects.getFs().getFileStatus(readFile).getLen(),
            null);
    org.apache.hadoop.mapreduce.RecordReader<String, Message> recordReader = inputFormat
            .createRecordReader(split, protoTestObjects.getContext());
    try {
        recordReader.initialize(split, protoTestObjects.getContext());

        for (int i = 0; i < protoTestObjects.getTestItems().size(); i++) {
            Assert.assertTrue("Fewer objects than expected.", recordReader.nextKeyValue());
            Message message = recordReader.getCurrentValue();

            protoTestObjects.compareMessages(protoTestObjects.getTestItem(i), message);
        }
    } finally {
        // release the reader even when an assertion fails
        recordReader.close();
    }
}

From source file:com.metamx.milano.hadoop.MilanoProtoFileInputFormatTests.java

License:Apache License

/**
 * Same as reading the file normally, except that the message builder is set explicitly
 * on the input format before the reader is created; every message read is compared with
 * the test item at the same index.
 *
 * @throws Exception if the file cannot be read
 */
@Test
public void testReadFileNoMetadata() throws Exception {
    MilanoProtoFileInputFormat inputFormat = new MilanoProtoFileInputFormat();
    inputFormat.setBuilder(Testing.TestItem.newBuilder());

    // one split covering the whole file
    FileSplit split = new FileSplit(readFile, 0, protoTestObjects.getFs().getFileStatus(readFile).getLen(),
            null);
    org.apache.hadoop.mapreduce.RecordReader<String, Message> recordReader = inputFormat
            .createRecordReader(split, protoTestObjects.getContext());
    try {
        recordReader.initialize(split, protoTestObjects.getContext());

        for (int i = 0; i < protoTestObjects.getTestItems().size(); i++) {
            Assert.assertTrue("Fewer objects than expected.", recordReader.nextKeyValue());
            Message message = recordReader.getCurrentValue();

            protoTestObjects.compareMessages(protoTestObjects.getTestItem(i), message);
        }
    } finally {
        // release the reader even when an assertion fails
        recordReader.close();
    }
}

From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * /*from w  w w.  j  av a  2  s  .  co  m*/
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:com.scaleoutsoftware.soss.hserver.hadoop.MapperWrapperMapreduce.java

License:Apache License

/**
 * Runs mapper for the single split.
 *
 * @param mapOutputAccumulator mapOutputAccumulator to use
 * @param split    split to run on
 * @param splitIndex index of the split, used to create the task attempt id
 * @throws IOException if reading the split or writing the output fails
 * @throws ClassNotFoundException declared by the lookups of the job's input/output format classes
 * @throws InterruptedException if the record reader, mapper or writer is interrupted
 */
@Override
@SuppressWarnings("unchecked")
public void runSplit(MapOutputAccumulator<OUTKEY, OUTVALUE> mapOutputAccumulator, Object split, int splitIndex)
        throws IOException, ClassNotFoundException, InterruptedException {

    TaskAttemptID taskAttemptId = hadoopVersionSpecificCode.createTaskAttemptId(jobId, true, splitIndex);
    //Setup task ID info
    TaskAttemptContext taskContext = hadoopVersionSpecificCode.createTaskAttemptContext(configuration,
            taskAttemptId);

    InputFormat inputFormat = ReflectionUtils.newInstance(jobContext.getInputFormatClass(), configuration);

    //Create RecordReader
    org.apache.hadoop.mapreduce.RecordReader<INKEY, INVALUE> input = inputFormat
            .createRecordReader((InputSplit) split, taskContext);

    //Make a mapper
    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE> mapper;
    try {
        mapper = (org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>) mapperConstructor
                .newInstance();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    // Map-only jobs write through the job's output format and need a committer;
    // otherwise the map output goes to the supplied accumulator.
    org.apache.hadoop.mapreduce.RecordWriter output;
    OutputCommitter committer = null;
    if (mapOnlyJob) {
        OutputFormat outputFormat = ReflectionUtils.newInstance(jobContext.getOutputFormatClass(),
                configuration);
        output = (org.apache.hadoop.mapreduce.RecordWriter<OUTKEY, OUTVALUE>) outputFormat
                .getRecordWriter(taskContext);
        committer = outputFormat.getOutputCommitter(taskContext);
        committer.setupTask(taskContext);
    } else {
        output = new MapOutputCollector<OUTKEY, OUTVALUE>(mapOutputAccumulator);
    }

    org.apache.hadoop.mapreduce.Mapper<INKEY, INVALUE, OUTKEY, OUTVALUE>.Context mapperContext;
    try {
        input.initialize((InputSplit) split, taskContext);

        mapperContext = hadoopVersionSpecificCode
                .getMapperContext(configuration, taskAttemptId, input, output);
        mapper.run(mapperContext);
    } finally {
        // close the reader on success and also when initialize or the mapper fails
        input.close();
    }

    output.close(mapperContext);

    if (mapOnlyJob && committer != null) {
        committer.commitTask(taskContext);
    }
}

From source file:com.splout.db.hadoop.SchemaSampler.java

License:Apache License

/**
 * Samples the schema of the first record of the first input split that the given input
 * format produces for the given path.
 *
 * @param conf configuration used to build the job and the task attempt context
 * @param input path to sample from
 * @param inputFormat input format used to compute the splits and to read the first record
 * @return the schema of the first tuple read
 * @throws IOException if no split is produced, the task attempt context cannot be
 *                     created, or the first record cannot be read
 * @throws InterruptedException if computing the splits or reading is interrupted
 */
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);
    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.isEmpty()) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: "
                        + input);
    }
    InputSplit inputSplit = inputSplits.get(0);
    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    try {
        rReader.initialize(inputSplit, attemptContext);

        if (!rReader.nextKeyValue()) {
            throw new IOException(
                    "Can't read first record of first input split of the given path [" + input + "].");
        }

        // finally get the sample schema
        Schema schema = rReader.getCurrentKey().getSchema();
        log.info("Sampled schema from [" + input + "] : " + schema);
        return schema;
    } finally {
        // close the reader on success and also when the first record cannot be read
        rReader.close();
    }
}

From source file:com.splout.db.hadoop.TupleSampler.java

License:Apache License

/**
 * Random sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit
 * without using a Job.
 * The output is SequenceFile with keys.
 *
 * @param sampleSize total number of samples to take
 * @param hadoopConf configuration used for the file system, the writer and the task attempt contexts
 * @param outFile sequence file the sampled keys are written to
 * @param splits splits to sample from; must not be empty
 * @param splitToTableSpec table spec per split, used to compute the partition key
 * @param splitToFormat input format per split, used to create the record reader
 * @param specificHadoopConf optional per-split configuration entries applied to the attempt context
 * @param recordProcessorPerSplit record processor per split, applied to every tuple read
 * @param splitToJsEngine Javascript engine per split, used to compute the partition key
 * @param maxSplitsToVisit upper bound on the number of splits that are read
 * @return The number of retrieved samples
 * @throws IOException if reading a split or writing the samples fails
 */
private long randomSampling(long sampleSize, Configuration hadoopConf, Path outFile, List<InputSplit> splits,
        Map<InputSplit, TableSpec> splitToTableSpec,
        Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
        Map<InputSplit, Map<String, String>> specificHadoopConf,
        Map<InputSplit, RecordProcessor> recordProcessorPerSplit,
        Map<InputSplit, JavascriptEngine> splitToJsEngine, int maxSplitsToVisit) throws IOException {

    // Instantiate the writer we will write samples to
    FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);

    if (splits.size() == 0) {
        throw new IllegalArgumentException("There are no splits to sample from!");
    }

    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, hadoopConf, outFile, Text.class,
            NullWritable.class);

    long records = 0;
    try {
        logger.info("Sequential sampling options, max splits to visit: " + maxSplitsToVisit
                + ", samples to take: " + sampleSize + ", total number of splits: " + splits.size());
        int blocks = Math.min(maxSplitsToVisit, splits.size());
        blocks = Math.min((int) sampleSize, blocks);
        // NOTE(review): if maxSplitsToVisit or sampleSize is 0, blocks is 0 and the divisions
        // below throw ArithmeticException - confirm callers never pass 0.
        long recordsPerSample = sampleSize / blocks;
        int sampleStep = splits.size() / blocks;

        // counters requested by the record processors are mocks
        CounterInterface counterInterface = new CounterInterface(null) {

            public Counter getCounter(String group, String name) {
                return Mockito.mock(Counter.class);
            }
        };

        // Take N samples from different parts of the input
        for (int i = 0; i < blocks; ++i) {
            TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);

            TaskAttemptContext attemptContext = null;
            try {
                attemptContext = TaskAttemptContextFactory.get(hadoopConf, attemptId);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            InputSplit split = splits.get(sampleStep * i);
            Map<String, String> splitConf = specificHadoopConf.get(split);
            if (splitConf != null) {
                for (Map.Entry<String, String> specificConf : splitConf.entrySet()) {
                    attemptContext.getConfiguration().set(specificConf.getKey(), specificConf.getValue());
                }
            }
            logger.info("Sampling split: " + split);
            RecordReader<ITuple, NullWritable> reader = null;
            try {
                reader = splitToFormat.get(split).createRecordReader(split, attemptContext);
                reader.initialize(split, attemptContext);

                RecordProcessor processor = recordProcessorPerSplit.get(split);
                Text key = new Text();
                while (reader.nextKeyValue()) {
                    ITuple tuple = reader.getCurrentKey();

                    ITuple uTuple;
                    try {
                        uTuple = processor.process(tuple, tuple.getSchema().getName(), counterInterface);
                    } catch (Throwable e) {
                        throw new RuntimeException(e);
                    }
                    if (uTuple != null) { // user may have filtered the record
                        try {
                            key.set(TablespaceGenerator.getPartitionByKey(uTuple, splitToTableSpec.get(split),
                                    splitToJsEngine.get(split)));
                        } catch (Throwable e) {
                            throw new RuntimeException("Error when determining partition key.", e);
                        }

                        writer.append(key, NullWritable.get());
                        records += 1;
                        // stop once this block has contributed its share of the running total
                        if ((i + 1) * recordsPerSample <= records) {
                            break;
                        }
                    }
                }
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            } finally {
                // each split gets its own reader; release it before moving to the next split
                if (reader != null) {
                    reader.close();
                }
            }

        }
    } finally {
        // close the sample file on success and on failure
        writer.close();
    }
    return records;
}

From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License:Apache License

/**
 * Reads up to {@code batchSize} lines from the start of the given file as text.
 *
 * @param fileStatus status of the file to preview; its path and length define the split
 * @param batchSize maximum number of entries to return
 * @return one entry per line read, keyed by {@code <file path>::<record key>} with the
 *         line's string value
 * @throws IOException if the file cannot be read
 * @throws InterruptedException if reading is interrupted
 */
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    // a single split spanning the whole file
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    List<Map.Entry> batch = new ArrayList<>();
    // try-with-resources closes the reader once the preview has been collected
    try (RecordReader<LongWritable, Text> recordReader = textInputFormat.createRecordReader(fileSplit,
            taskAttemptContext)) {
        recordReader.initialize(fileSplit, taskAttemptContext);
        boolean hasNext = recordReader.nextKeyValue();
        while (hasNext && batch.size() < batchSize) {
            batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                    String.valueOf(recordReader.getCurrentValue())));
            hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
        }
    }
    return batch;
}