List of usage examples for org.apache.hadoop.mapred RecordReader getPos
long getPos() throws IOException;
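getPos() returns the reader's current position in the input (for file-based readers, typically a byte offset). Before the examples from real projects below, here is a minimal sketch of a typical call site using the old org.apache.hadoop.mapred API; it is not taken from any of the source files listed, and the input path "/tmp/sample.txt" is only a placeholder.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetPosExample {
    public static void main(String[] args) throws IOException {
        // Placeholder input path; point it at any existing text file.
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path("/tmp/sample.txt"));

        TextInputFormat format = new TextInputFormat();
        format.configure(job);

        InputSplit[] splits = format.getSplits(job, 1);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[0], job, Reporter.NULL);
        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                // For text input, getPos() reports the current byte offset in the split's file.
                System.out.println("pos=" + reader.getPos() + " line=" + value);
            }
        } finally {
            reader.close();
        }
    }
}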
From source file:cascading.tap.hadoop.ZipInputFormatTest.java
License:Open Source License
public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);
    Path file = new Path(workDir, "test.zip");
    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);

    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);
        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);

        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);

            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());

                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));

                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);

    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());

    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testEmptyFile() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.close(true);

    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    SerDe serde = new OrcSerde();
    serde.initialize(conf, properties);

    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    conf.set("hive.io.file.readcolumn.ids", "0,1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    assertEquals(false, reader.next(key, value));
    reader.close();
    assertEquals(null, serde.getSerDeStats());
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java
License:Apache License
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {
    setColumns(job);

    final RecordReader<ImmutableBytesWritable, Result> rr = delegate
            .getRecordReader(((HBaseSplit) split).getSnapshotSplit(), job, reporter);

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {
        @Override
        public boolean next(ImmutableBytesWritable key, ResultWritable value) throws IOException {
            return rr.next(key, value.getResult());
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return rr.createKey();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(rr.createValue());
        }

        @Override
        public long getPos() throws IOException {
            return rr.getPos();
        }

        @Override
        public void close() throws IOException {
            rr.close();
        }

        @Override
        public float getProgress() throws IOException {
            return rr.getProgress();
        }
    };
}
From source file:com.ibm.jaql.io.hadoop.CompositeInputAdapter.java
License:Apache License
@SuppressWarnings("unchecked") public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { CompositeSplit cSplit = (CompositeSplit) split; // 1. get the InputAdapter's array index (i) from the split final int idx = cSplit.getAdapterIdx(); InputSplit baseSplit = cSplit.getSplit(); try {//from w w w . j av a2s.c om // 2. get the ith adapter's args record JsonValue value = this.args.get(idx); // JRecord baseArgs = (JRecord) item.getNonNull(); // record the current index to the job conf // ASSUMES: in map/reduce, the format's record reader is called *before* // the map class is configured writeCurrentIndex(job, idx); // FIXME: no longer needed // 3. insantiate and initialize the adapter HadoopInputAdapter adapter = (HadoopInputAdapter) AdapterStore.getStore().input .getAdapter(/** baseArgs, */ value); // 4. create a new JobConf j' JobConf jTmp = new JobConf(job); // 5. call adapter's setupConf(j') // ConfiguratorUtil.writeToConf(adapter, jTmp, item/**baseArgs*/); adapter.setParallel(jTmp); // 6. configure the adapter from j' adapter.configure(jTmp); // 7. call adapter's getRecordReader with j' final RecordReader<JsonHolder, JsonHolder> reader = (RecordReader<JsonHolder, JsonHolder>) adapter .getRecordReader(baseSplit, jTmp, reporter); if (!addIndex) { return reader; } return new RecordReader<JsonHolder, JsonHolder>() { @Override public void close() throws IOException { reader.close(); } @Override public JsonHolder createKey() { return reader.createKey(); } @Override public JsonHolder createValue() { return reader.createValue(); } @Override public long getPos() throws IOException { return reader.getPos(); } @Override public float getProgress() throws IOException { return reader.getProgress(); } @Override public boolean next(JsonHolder key, JsonHolder value) throws IOException { BufferedJsonArray pair = (BufferedJsonArray) value.value; if (pair != null) { value.value = pair.get(1); } else { pair = new BufferedJsonArray(2); pair.set(0, JsonLong.make(idx)); } if (reader.next(key, value)) { pair.set(1, value.value); value.value = pair; return true; } return false; } }; } catch (Exception e) { return null; } }
From source file:com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java
License:Apache License
@SuppressWarnings("unchecked") public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { if (split instanceof DHIASplit) { // not using order-preserving wrapper split = ((DHIASplit) split).split; }/*from w ww . java2 s. c o m*/ if (converter == null) return ((InputFormat<JsonHolder, JsonHolder>) iFormat).getRecordReader(split, job, reporter); final RecordReader<K, V> baseReader = ((InputFormat<K, V>) iFormat).getRecordReader(split, job, reporter); final K baseKey = baseReader.createKey(); final V baseValue = baseReader.createValue(); return new RecordReader<JsonHolder, JsonHolder>() { public void close() throws IOException { baseReader.close(); } public JsonHolder createKey() { return keyHolder(); } public JsonHolder createValue() { JsonHolder holder = valueHolder(); holder.value = converter.createTarget(); return holder; } public long getPos() throws IOException { return baseReader.getPos(); } public float getProgress() throws IOException { return baseReader.getProgress(); } public boolean next(JsonHolder key, JsonHolder value) throws IOException { boolean hasMore = baseReader.next(baseKey, baseValue); if (!hasMore) return false; value.value = converter.convert(baseKey, baseValue, value.value); return true; } }; }