Example usage for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.RecordReader#initialize.

Prototype

public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Documentation

Called once at initialization.
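
A RecordReader is not ready for use until initialize has been called with the same split and task attempt context that were passed to InputFormat.createRecordReader. The following is a minimal sketch of that lifecycle, assuming the Hadoop 2 mapreduce API; the TextInputFormat and the input path are placeholders chosen for illustration:

Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
FileInputFormat.addInputPath(job, new Path("/tmp/input")); // placeholder path

TextInputFormat format = new TextInputFormat();
TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

for (InputSplit split : format.getSplits(job)) {
    RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
    reader.initialize(split, context); // called once, before any nextKeyValue()
    try {
        while (reader.nextKeyValue()) {
            // consume reader.getCurrentKey() / reader.getCurrentValue()
        }
    } finally {
        reader.close();
    }
}

Inside a running job the framework performs these calls itself; driving the reader by hand like this is mainly useful in tests and tools, as the examples below show.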

Usage

From source file: co.cask.cdap.data.stream.StreamInputFormatTest.java

License: Apache License
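
This CDAP test writes a single event into a stream partition, configures StreamInputFormat with a text body format, then creates and initializes a RecordReader for each split to verify that the one event is read back with its headers and body intact.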

@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
            Files.newOutputStreamSupplier(indexFile), 100L);

    StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
            Charsets.UTF_8.encode("hello world"), 1000);
    writer.append(streamEvent);
    writer.close();

    FormatSpecification formatSpec = new FormatSpecification(TextRecordFormat.class.getName(),
            Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
            Collections.<String, String>emptyMap());
    Configuration conf = new Configuration();
    StreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    StreamInputFormat format = new StreamInputFormat();

    // read all splits and store the results in the list
    List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
    List<InputSplit> inputSplits = format.getSplits(context);
    for (InputSplit split : inputSplits) {
        RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader = format
                .createRecordReader(split, context);
        recordReader.initialize(split, context);
        while (recordReader.nextKeyValue()) {
            recordsRead.add(recordReader.getCurrentValue());
        }
    }

    // should only have read 1 record
    Assert.assertEquals(1, recordsRead.size());
    GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
    Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
    Assert.assertEquals("hello world", eventData.getBody().get("body"));
}

From source file: co.cask.cdap.template.etl.common.ETLDBInputFormat.java

License: Apache License
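
This example wraps the reader returned by the superclass's createDBRecordReader in an anonymous RecordReader that delegates every call, including initialize, and additionally deregisters the JDBC driver when close is called.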

@Override
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf) throws IOException {
    final RecordReader dbRecordReader = super.createDBRecordReader(split, conf);
    return new RecordReader() {
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            dbRecordReader.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return dbRecordReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentKey();
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return dbRecordReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            dbRecordReader.close();
            try {
                DriverManager.deregisterDriver(driverShim);
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    };
}

From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java

License: Apache License
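
This test writes one sequence file, expects a single combined split, and initializes the reader on that split before asserting that exactly one key/value pair is returned.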

@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);

    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);
    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java

License: Apache License
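
This test writes two sequence files that are combined into a single split; after initializing the reader it checks that both records are returned and that getProgress advances from roughly 0.5 to 1.0.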

@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile1);
    FileInputFormat.addInputPath(job, inputFile2);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);

    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);
    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file: com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License: Apache License
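
This TeraSort-style sampler spawns one daemon thread per sampled split; each thread creates a RecordReader, initializes it on its split, and feeds keys to the sampler until the per-sample record quota is reached.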

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // interrupted while sampling; let the reader thread exit quietly
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // interrupted while waiting; move on to the next sampler thread
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.input.HCatTupleInputFormat.java

License: Apache License
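
This input format wraps an HCatInputFormat reader in a RecordReader<ITuple, NullWritable>: initialize simply delegates to the underlying HCatalog reader, and getCurrentKey converts each HCatRecord into a Pangool Tuple.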

@Override
public RecordReader<ITuple, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {

    HCatInputFormat iF = new HCatInputFormat();

    @SuppressWarnings("rawtypes")
    final RecordReader<WritableComparable, HCatRecord> hCatRecordReader = iF.createRecordReader(split,
            taskContext);

    return new RecordReader<ITuple, NullWritable>() {

        ITuple tuple = new Tuple(pangoolSchema);

        @Override
        public void close() throws IOException {
            hCatRecordReader.close();
        }

        @Override
        public ITuple getCurrentKey() throws IOException, InterruptedException {
            HCatRecord record = hCatRecordReader.getCurrentValue();
            // Perform conversion between HCatRecord and Tuple
            for (int pos = 0; pos < schema.size(); pos++) {
                tuple.set(pos, record.get(pos));
            }
            return tuple;
        }

        @Override
        public NullWritable getCurrentValue() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return hCatRecordReader.getProgress();
        }

        @Override
        public void initialize(InputSplit iS, TaskAttemptContext context)
                throws IOException, InterruptedException {
            hCatRecordReader.initialize(iS, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return hCatRecordReader.nextKeyValue();
        }
    };
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java

License: Apache License
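
This test writes a tuple file, then iterates over every split produced by TupleInputFormat, initializing a reader per split and counting the tuples read to confirm that nothing is lost regardless of the maximum split size.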

public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of rows read must
    // be the same as the number written.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);

    HadoopUtils.deleteIfExists(fS, outPath);
}

From source file: com.facebook.hiveio.benchmark.InputBenchmark.java

License: Apache License
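
This benchmark fetches splits from HiveApiInputFormat and, for each split, creates and initializes a record reader before reading it to completion while timing the whole run.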

/**
 * Run benchmark
 *
 * @param args parsed args
 * @throws Exception
 */
public void run(InputBenchmarkCmd args) throws Exception {
    HadoopNative.requireHadoopNative();

    Timer allTime = Metrics.newTimer(InputBenchmark.class, "all-time", MILLISECONDS, MILLISECONDS);
    TimerContext allTimerContext = allTime.time();

    HiveInputDescription input = new HiveInputDescription();
    input.getTableDesc().setDatabaseName(args.tableOpts.database);
    input.getTableDesc().setTableName(args.tableOpts.table);
    input.setPartitionFilter(args.tableOpts.partitionFilter);
    input.getMetastoreDesc().setHost(args.metastoreOpts.host);
    input.getMetastoreDesc().setPort(args.metastoreOpts.port);

    HiveConf hiveConf = HiveUtils.newHiveConf(InputBenchmark.class);

    System.err.println("Initialize profile with input data");
    HiveApiInputFormat.setProfileInputDesc(hiveConf, input, DEFAULT_PROFILE_ID);

    HiveApiInputFormat defaultInputFormat = new HiveApiInputFormat();
    if (args.trackMetrics) {
        defaultInputFormat.setObserver(new MetricsObserver("default", args.recordPrintPeriod));
    }

    List<InputSplit> splits = defaultInputFormat.getSplits(new JobContext(hiveConf, new JobID()));
    System.err.println("getSplits returned " + splits.size() + " splits");

    long numRows = 0;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        TaskAttemptID taskID = new TaskAttemptID();
        TaskAttemptContext taskContext = new TaskAttemptContext(hiveConf, taskID);
        if (i % args.splitPrintPeriod == 0) {
            System.err.println("Handling split " + i + " of " + splits.size());
        }
        RecordReader<WritableComparable, HiveReadableRecord> reader = defaultInputFormat
                .createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        numRows += readFully(reader);
    }

    System.err.println("Parsed " + numRows + " rows");

    allTimerContext.stop();

    new ConsoleReporter(System.err).run();
}

From source file: com.facebook.hiveio.tailer.TailerCmd.java

License: Apache License
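
This helper creates and initializes a record reader for a single Hive input split, then iterates over its records, parsing or printing each one until the configured row limit is reached.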

/**
 * Read input split
 *
 * @param split InputSplit
 * @param context Context
 * @throws IOException
 * @throws InterruptedException
 */
private void readSplit(InputSplit split, Context context) throws IOException, InterruptedException {
    TaskAttemptID taskId = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(context.hiveConf, taskId);
    RecordReader<WritableComparable, HiveReadableRecord> recordReader;
    recordReader = context.hiveApiInputFormat.createRecordReader(split, taskContext);
    recordReader.initialize(split, taskContext);

    int rowsParsed = 0;
    while (recordReader.nextKeyValue() && !context.limitReached(args.limit)) {
        HiveReadableRecord record = recordReader.getCurrentValue();
        if (args.parser.parseOnly) {
            rowParser.parse(record);
        } else {
            recordPrinter.printRecord(record, context.schema.numColumns(), context, args);
        }
        ++rowsParsed;
        if (context.rowsParsed.incrementAndGet() >= args.limit) {
            break;
        }
        if (rowsParsed % args.metricsOpts.updateRows == 0) {
            context.stats.addRows(args.metricsOpts.updateRows);
            rowsParsed = 0;
        }
    }
    context.stats.addRows(rowsParsed);
}

From source file: com.google.appengine.tools.mapreduce.AppEngineTaskAttemptContext.java

License: Apache License
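
This method reconstitutes a previously serialized RecordReader: it instantiates the reader class reflectively, calls initialize with the saved split and this context, and then restores the serialized reader state into the freshly initialized instance.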

/**
 * Reconstitutes the record reader for this invocation of this shard.
 *
 * @param <INKEY> the type of the key to be returned from the reader
 * @param <INVALUE> the type of the value to be returned from the reader
 * @param split the input split from {@link #getInputSplit()}
 * @return the reconstituted record reader
 * @throws IOException if there is an error reading the reader
 */
@SuppressWarnings("unchecked")
public <INKEY, INVALUE> RecordReader<INKEY, INVALUE> getRecordReader(InputSplit split) throws IOException {
    byte[] serializedReader = state.getSerializedRecordReader();
    String readerClassName = state.getRecordReaderClassName();
    try {
        RecordReader<INKEY, INVALUE> initReader = (RecordReader<INKEY, INVALUE>) ReflectionUtils
                .newInstance(getConfiguration().getClassByName(readerClassName), getConfiguration());
        initReader.initialize(split, this);
        return SerializationUtil.deserializeFromByteArray(getConfiguration(), RecordReader.class,
                readerClassName, serializedReader, initReader);
    } catch (InterruptedException e) {
        throw new RuntimeException("Got interrupted in a single threaded environment.", e);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException("Couldn't find previously found RecordReader class: " + readerClassName, e);
    }
}