List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
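This method creates a RecordReader for the given input split. The MapReduce framework calls RecordReader.initialize(InputSplit, TaskAttemptContext) before the split is used, so code that invokes createRecordReader directly, as every example below does, must call initialize itself before the first nextKeyValue() and close() when done. A minimal sketch of that call sequence (the readAll method, its Job and TaskAttemptContext parameters, and the choice of TextInputFormat are illustrative assumptions, not taken from any example below):

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class CreateRecordReaderSketch {
    // Reads every record of every split and prints it: the canonical
    // createRecordReader / initialize / nextKeyValue / close sequence.
    static void readAll(Job job, TaskAttemptContext context) throws IOException, InterruptedException {
        InputFormat<LongWritable, Text> inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            // The reader comes back uninitialized; outside the framework the
            // caller must initialize it before the first nextKeyValue().
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
            reader.close();
        }
    }
}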
From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java
License:Apache License
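Samples keys from the first few input splits of a job, creating and initializing one RecordReader per sampled split: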
/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
    int samplesPerSplit = numSamples / splitsToSample;
    log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit);
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(),
                new TaskAttemptID());
        InputSplit split = splits.get(i);
        log.debug("Sampling {} split", split);
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        reader.initialize(split, samplingContext);
        while (reader.nextKeyValue()) {
            LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey();
            // TODO: why do we need to do that? Why on earth do we have -1 in the
            // subject, predicate or object position?
            if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) {
                LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1),
                        currentKey.get(2), currentKey.get(3));
                log.debug("Sampled {}", key);
                samples.add((K) key);
                ++records;
                if (records >= (i + 1) * samplesPerSplit) {
                    log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records,
                            (i + 1) * samplesPerSplit);
                    break;
                }
            }
        }
        reader.close();
    }
    return (K[]) samples.toArray();
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java
License:Apache License
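Reads DurableBuffer records back from the part files written by an earlier test, accumulating a CRC32 checksum and verifying record count, total size, and checksum: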
@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum
    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceChunkDataTest.java
License:Apache License
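The same verification pattern for DurableChunk records, checksumming each chunk byte by byte through Unsafe: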
@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
    List<String> partfns = new ArrayList<String>();
    long reccnt = 0L;
    long tsize = 0L;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(partfns); // keep the order for checksum
    for (int idx = 0; idx < partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableChunk<?>> dchkval = null;
        while (reader.nextKeyValue()) {
            dchkval = reader.getCurrentValue();
            byte b;
            for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
                b = unsafe.getByte(dchkval.getValue().get() + j);
                cs.update(b);
            }
            tsize += dchkval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceLongDataTest.java
License:Apache License
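Reads Long values back from the part files and verifies their count and sum: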
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
    long sum = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Long>, Long>();
            RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Long> mdval = null;
            while (reader.nextKeyValue()) {
                mdval = reader.getCurrentValue();
                sum += mdval.getValue();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_sum, sum);
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    System.out.println(String.format("The checksum of long data is %d", sum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreducePersonDataTest.java
License:Apache License
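Reads durable Person records back and verifies the record count and the sum of ages: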
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}
From source file:org.apache.parquet.pig.PerfTest2.java
License:Apache License
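Benchmarks reading a Parquet file through Pig's ParquetLoader, creating one reader per split and timing a full scan of the requested columns: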
static void load(String out, int colsToLoad, StringBuilder results) throws Exception {
    StringBuilder schemaString = new StringBuilder("a0: chararray");
    for (int i = 1; i < colsToLoad; i++) {
        schemaString.append(", a" + i + ": chararray");
    }
    long t0 = System.currentTimeMillis();
    Job job = new Job(conf);
    int loadjobId = jobid++;
    LoadFunc loadFunc = new ParquetLoader(schemaString.toString());
    loadFunc.setUDFContextSignature("sigLoader" + loadjobId);
    String absPath = loadFunc.relativeToAbsolutePath(out, new Path(new File(".").getAbsoluteFile().toURI()));
    loadFunc.setLocation(absPath, job);
    @SuppressWarnings("unchecked") // that's how the base class is defined
    InputFormat<Void, Tuple> inputFormat = loadFunc.getInputFormat();
    JobContext jobContext = ContextUtil.newJobContext(ContextUtil.getConfiguration(job),
            new JobID("jt", loadjobId));
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    int i = 0;
    int taskid = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID("jt", loadjobId, true, taskid++, 0));
        RecordReader<Void, Tuple> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        loadFunc.prepareToRead(recordReader, null);
        recordReader.initialize(split, taskAttemptContext);
        Tuple t;
        while ((t = loadFunc.getNext()) != null) {
            if (Log.DEBUG)
                System.out.println(t);
            ++i;
        }
    }
    assertEquals(ROW_COUNT, i);
    long t1 = System.currentTimeMillis();
    results.append((t1 - t0) + " ms to read " + colsToLoad + " columns\n");
}
From source file:org.apache.pig.piggybank.storage.IndexedStorage.java
License:Apache License
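Opens and initializes one IndexedStorageRecordReader per file split up front, then sorts the readers so records can be served in index order: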
/**
 * IndexableLoadFunc interface implementation
 */
@Override
public void initialize(Configuration conf) throws IOException {
    try {
        InputFormat inputFormat = this.getInputFormat();
        TaskAttemptID id = HadoopShims.getNewTaskAttemptID();
        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }
        List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null));
        this.readers = new IndexedStorageRecordReader[fileSplits.size()];
        int idx = 0;
        Iterator<FileSplit> it = fileSplits.iterator();
        while (it.hasNext()) {
            FileSplit fileSplit = it.next();
            TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id);
            IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat
                    .createRecordReader(fileSplit, context);
            r.initialize(fileSplit, context);
            this.readers[idx] = r;
            idx++;
        }
        Arrays.sort(this.readers, this.readerComparator);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.pig.test.TestReadToEndLoader.java
License:Apache License
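A Mockito-based test verifying that ReadToEndLoader closes the RecordReader of the last input split exactly once: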
@Test
public void testIsReaderForLastSplitClosed() throws Exception {
    final LoadFunc loadFunc = mock(LoadFunc.class);
    final InputFormat inputFormat = mock(InputFormat.class);
    final RecordReader recordReader = mock(RecordReader.class);
    final InputSplit inputSplit = mock(InputSplit.class);

    // Define behavior
    when(loadFunc.getInputFormat()).thenReturn(inputFormat);
    when(inputFormat.createRecordReader(any(InputSplit.class), any(TaskAttemptContext.class)))
            .thenReturn(recordReader);
    when(inputFormat.getSplits(any(JobContext.class))).thenReturn(Arrays.asList(inputSplit));

    Configuration conf = new Configuration();
    ReadToEndLoader loader = new ReadToEndLoader(loadFunc, conf, "loc", 0);

    // This will return null since we haven't specified any behavior for this method
    Assert.assertNull(loader.getNext());

    // Verify that RecordReader.close for the last input split is called once
    verify(recordReader, times(1)).close();
}
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.GraphFilterRecordReader.java
License:Apache License
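Delegates to whatever InputFormat is configured as the Hadoop graph reader, instantiating it reflectively and wrapping the reader it creates so a graph filter can be applied: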
@Override
public void initialize(final InputSplit inputSplit, final TaskAttemptContext taskAttemptContext)
        throws IOException, InterruptedException {
    final Configuration configuration = taskAttemptContext.getConfiguration();
    final InputFormat<NullWritable, VertexWritable> inputFormat = ReflectionUtils.newInstance(
            configuration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class, InputFormat.class),
            configuration);
    if (!(inputFormat instanceof GraphFilterAware)
            && configuration.get(Constants.GREMLIN_HADOOP_GRAPH_FILTER, null) != null)
        this.graphFilter = VertexProgramHelper.deserialize(ConfUtil.makeApacheConfiguration(configuration),
                Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    this.recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
    this.recordReader.initialize(inputSplit, taskAttemptContext);
}
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HadoopElementIterator.java
License:Apache License
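Builds one reader per non-hidden file under a path, handing each createRecordReader call a synthetic FileSplit that spans the whole file: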
public HadoopElementIterator(final HadoopGraph graph,
        final InputFormat<NullWritable, VertexWritable> inputFormat, final Path path)
        throws IOException, InterruptedException {
    this.graph = graph;
    final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
    for (final FileStatus status : FileSystem.get(configuration).listStatus(path, HiddenFileFilter.instance())) {
        this.readers.add(inputFormat.createRecordReader(
                new FileSplit(status.getPath(), 0, Integer.MAX_VALUE, new String[] {}),
                new TaskAttemptContext(configuration, new TaskAttemptID())));
    }
}