List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
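This method creates a RecordReader for the given input split. The MapReduce framework calls RecordReader.initialize(InputSplit, TaskAttemptContext) before the split is used, so code that invokes createRecordReader directly, as every example below does, must call initialize itself before the first nextKeyValue() and close() when done. A minimal sketch of that call sequence (the readAll method, its Job and TaskAttemptContext parameters, and the choice of TextInputFormat are illustrative assumptions, not taken from any example below):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CreateRecordReaderSketch {
    // Reads every record of every split and prints it: the canonical
    // createRecordReader / initialize / nextKeyValue / close sequence.
    static void readAll(Job job, TaskAttemptContext context) throws IOException, InterruptedException {
        InputFormat<LongWritable, Text> inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            // The reader comes back uninitialized; outside the framework the
            // caller must initialize it before the first nextKeyValue().
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
            reader.close();
        }
    }
}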
From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java
License:Apache License
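Samples keys from the first few input splits of a job, creating and initializing one RecordReader per sampled split: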
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
    int samplesPerSplit = numSamples / splitsToSample;
    log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit);
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(),
                new TaskAttemptID());
        InputSplit split = splits.get(i);
        log.debug("Sampling {} split", split);
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        reader.initialize(split, samplingContext);
        while (reader.nextKeyValue()) {
            LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey();
            // TODO: why do we need to do that? Why on earth do we have -1 in the
            // subject, predicate or object position?
            if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) {
                LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1),
                        currentKey.get(2), currentKey.get(3));
                log.debug("Sampled {}", key);
                samples.add((K) key);
                ++records;
                if (records >= (i + 1) * samplesPerSplit) {
                    log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records,
                            (i + 1) * samplesPerSplit);
                    break;
                }
            }
        }
        reader.close();
    }
    return (K[]) samples.toArray();
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java
License:Apache License
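Reads DurableBuffer records back from the part files written by an earlier test, accumulating a CRC32 checksum and verifying record count, total size, and checksum: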
@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum
    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceChunkDataTest.java
License:Apache License
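The same verification pattern for DurableChunk records, checksumming each chunk byte by byte through Unsafe: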
@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
    List<String> partfns = new ArrayList<String>();
    long reccnt = 0L;
    long tsize = 0L;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(partfns); // keep the order for checksum
    for (int idx = 0; idx < partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableChunk<?>> dchkval = null;
        while (reader.nextKeyValue()) {
            dchkval = reader.getCurrentValue();
            byte b;
            for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
                b = unsafe.getByte(dchkval.getValue().get() + j);
                cs.update(b);
            }
            tsize += dchkval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceLongDataTest.java
License:Apache License
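Reads Long values back from the part files and verifies their count and sum: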
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
    long sum = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Long>, Long>();
            RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Long> mdval = null;
            while (reader.nextKeyValue()) {
                mdval = reader.getCurrentValue();
                sum += mdval.getValue();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_sum, sum);
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    System.out.println(String.format("The checksum of long data is %d", sum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreducePersonDataTest.java
License:Apache License
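Reads durable Person records back and verifies the record count and the sum of ages: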
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}
From source file:org.apache.parquet.pig.PerfTest2.java
License:Apache License
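Benchmarks reading a Parquet file through Pig's ParquetLoader, creating one reader per split and timing a full scan of the requested columns: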
static void load(String out, int colsToLoad, StringBuilder results) throws Exception {
    StringBuilder schemaString = new StringBuilder("a0: chararray");
    for (int i = 1; i < colsToLoad; i++) {
        schemaString.append(", a" + i + ": chararray");
    }
    long t0 = System.currentTimeMillis();
    Job job = new Job(conf);
    int loadjobId = jobid++;
    LoadFunc loadFunc = new ParquetLoader(schemaString.toString());
    loadFunc.setUDFContextSignature("sigLoader" + loadjobId);
    String absPath = loadFunc.relativeToAbsolutePath(out, new Path(new File(".").getAbsoluteFile().toURI()));
    loadFunc.setLocation(absPath, job);
    @SuppressWarnings("unchecked") // that's how the base class is defined
    InputFormat<Void, Tuple> inputFormat = loadFunc.getInputFormat();
    JobContext jobContext = ContextUtil.newJobContext(ContextUtil.getConfiguration(job),
            new JobID("jt", loadjobId));
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    int i = 0;
    int taskid = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID("jt", loadjobId, true, taskid++, 0));
        RecordReader<Void, Tuple> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        loadFunc.prepareToRead(recordReader, null);
        recordReader.initialize(split, taskAttemptContext);
        Tuple t;
        while ((t = loadFunc.getNext()) != null) {
            if (Log.DEBUG)
                System.out.println(t);
            ++i;
        }
    }
    assertEquals(ROW_COUNT, i);
    long t1 = System.currentTimeMillis();
    results.append((t1 - t0) + " ms to read " + colsToLoad + " columns\n");
}
From source file:org.apache.pig.piggybank.storage.IndexedStorage.java
License:Apache License
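Opens and initializes one IndexedStorageRecordReader per file split up front, then sorts the readers so records can be served in index order: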
/**
 * IndexableLoadFunc interface implementation
 */
@Override
public void initialize(Configuration conf) throws IOException {
    try {
        InputFormat inputFormat = this.getInputFormat();
        TaskAttemptID id = HadoopShims.getNewTaskAttemptID();
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }
        List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
        this.readers = new IndexedStorageRecordReader[fileSplits.size()];
        int idx = 0;
        Iterator<FileSplit> it = fileSplits.iterator();
        while (it.hasNext()) {
            FileSplit fileSplit = it.next();
            TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
            IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat
                    .createRecordReader(fileSplit, context);
            r.initialize(fileSplit, context);
            this.readers[idx] = r;
            idx++;
        }
        Arrays.sort(this.readers, this.readerComparator);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.pig.test.TestReadToEndLoader.java
License:Apache License
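A Mockito-based test verifying that ReadToEndLoader closes the RecordReader of the last input split exactly once: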
@Test
public void testIsReaderForLastSplitClosed() throws Exception {
    final LoadFunc loadFunc = mock(LoadFunc.class);
    final InputFormat inputFormat = mock(InputFormat.class);
    final RecordReader recordReader = mock(RecordReader.class);
    final InputSplit inputSplit = mock(InputSplit.class);

    // Define behavior
    when(loadFunc.getInputFormat()).thenReturn(inputFormat);
    when(inputFormat.createRecordReader(any(InputSplit.class), any(TaskAttemptContext.class)))
            .thenReturn(recordReader);
    when(inputFormat.getSplits(any(JobContext.class))).thenReturn(Arrays.asList(inputSplit));

    Configuration conf = new Configuration();
    ReadToEndLoader loader = new ReadToEndLoader(loadFunc, conf, "loc", 0);

    // This will return null since we haven't specified any behavior for this method
    Assert.assertNull(loader.getNext());

    // Verify that RecordReader.close for the last input split is called once
    verify(recordReader, times(1)).close();
}
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.GraphFilterRecordReader.java
License:Apache License
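Delegates to whatever InputFormat is configured as the Hadoop graph reader, instantiating it reflectively and wrapping the reader it creates so a graph filter can be applied: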
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    final Configuration configuration = taskAttemptContext.getConfiguration();
    final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(
            configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class),
            configuration);
    if (!(inputFormat instanceof GraphFilterAware)
            && configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration),
                Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
    this.recordReader.initialize(inputSplit, taskAttemptContext);
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HadoopElementIterator.java
License:Apache License
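Builds one reader per non-hidden file under a path, handing each createRecordReader call a synthetic FileSplit that spans the whole file: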
public HadoopElementIterator(final HadoopGraph graph,
        final InputFormat<NullWritable, VertexWritable> inputFormat, final Path path)
        throws IOException, InterruptedException {
    this.graph = graph;
    final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
    for (final FileStatus status : FileSystem.get(configuration).listStatus(path, HiddenFileFilter.instance())) {
        this.readers.add(inputFormat.createRecordReader(
                new FileSplit(status.getPath(), 0, Integer.MAX_VALUE, new String[] {}),
                new TaskAttemptContext(configuration, new TaskAttemptID())));
    }
}