Usage examples for org.apache.hadoop.mapred.RecordReader.getProgress()
float getProgress() throws IOException;
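Most of the examples below follow the same pattern: an old-API (org.apache.hadoop.mapred) RecordReader wraps a new-API (org.apache.hadoop.mapreduce) reader and implements getProgress() by delegating to it, translating the new API's InterruptedException into the IOException that the old interface declares. Here is a minimal sketch of that pattern; the class name and the Text/Text key-value types are illustrative, not taken from any of the source files below.

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;

// Hypothetical wrapper: adapts a new-API reader to the old mapred interface.
public class DelegatingTextRecordReader implements RecordReader<Text, Text> {

    private final org.apache.hadoop.mapreduce.RecordReader<Text, Text> delegate;

    public DelegatingTextRecordReader(org.apache.hadoop.mapreduce.RecordReader<Text, Text> delegate) {
        this.delegate = delegate;
    }

    public boolean next(Text key, Text value) throws IOException {
        try {
            if (!delegate.nextKeyValue()) {
                return false;
            }
            key.set(delegate.getCurrentKey());
            value.set(delegate.getCurrentValue());
            return true;
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    public Text createKey() {
        return new Text();
    }

    public Text createValue() {
        return new Text();
    }

    public long getPos() throws IOException {
        return 0; // Byte position is not meaningful for the wrapped reader.
    }

    public float getProgress() throws IOException {
        try {
            // Fraction of the split consumed so far, in [0.0f, 1.0f].
            return delegate.getProgress();
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }

    public void close() throws IOException {
        delegate.close();
    }
}

The returned value is the fraction of the split consumed so far, between 0.0f and 1.0f; the test cases further down poll it to drive or verify their read loops.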
From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java
License:Apache License
public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter)
        throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (Exception se) {
        throw new IOException(se);
    }

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (hbaseColumnFamilies.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;

    if (!addAll) {
        for (int i : readColIDs) {
            if (i == iKey) {
                continue;
            }
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            empty = false;
        }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty, i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            if (!addAll) {
                break;
            }
        }
    }

    // Set the start and end time for the scan.
    setTime(jobConf, scan);

    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);
    setScan(scan);

    Job job = new Job(jobConf);
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
        @Override
        public void progress() {
            reporter.progress();
        }
    };

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader =
            createRecordReader(tableSplit, tac);

    return new RecordReader<Text, Text>() {

        public void close() throws IOException {
            recordReader.close();
        }

        public Text createKey() {
            return new Text();
        }

        public Text createValue() {
            return new Text();
        }

        public long getPos() throws IOException {
            return 0;
        }

        public float getProgress() throws IOException {
            // Delegate to the wrapped mapreduce RecordReader, converting its
            // InterruptedException into the IOException this interface declares.
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        public boolean next(Text rowKey, Text value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(Bytes.toString(recordReader.getCurrentValue().getRow()));
                    // Build the value from the first character of each column
                    // qualifier plus the cell value, delimited by commas when the
                    // qualifier repeats and by tabs when it changes.
                    StringBuilder val = new StringBuilder();
                    String prev = "";
                    for (KeyValue kv : recordReader.getCurrentValue().raw()) {
                        String current = new String(kv.getQualifier());
                        char[] col = current.toCharArray();
                        if (val.length() > 0) {
                            if (prev.equals(current)) {
                                val.append(",");
                            } else {
                                val.append("\t");
                            }
                        }
                        prev = current;
                        val.append(col[0]).append("_");
                        val.append(Bytes.toString(kv.getValue()));
                    }
                    value.set(val.toString());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
From source file:com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java
License:Apache License
public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
                hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }

    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (hbaseColumnFamilies.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;

    if (!addAll) {
        for (int i : readColIDs) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            empty = false;
        }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty, i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            if (!addAll) {
                break;
            }
        }
    }

    // Set the start and end time for the scan.
    setTime(jobConf, scan);

    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);
    setScan(scan);

    Job job = new Job(jobConf);
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
        @Override
        public void progress() {
            reporter.progress();
        }
    };

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader =
            createRecordReader(tableSplit, tac);

    return new RecordReader<ImmutableBytesWritable, Result>() {

        public void close() throws IOException {
            recordReader.close();
        }

        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        public Result createValue() {
            return new Result();
        }

        public long getPos() throws IOException {
            return 0;
        }

        public float getProgress() throws IOException {
            // Delegate to the wrapped mapreduce RecordReader, converting its
            // InterruptedException into the IOException this interface declares.
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        public boolean next(ImmutableBytesWritable rowKey, Result value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    Writables.copyWritable(recordReader.getCurrentValue(), value);
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);

    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());

    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // Test the validateInput method.
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // Read the whole file: progress is 0.0 before the first record and 1.0
    // once every row has been consumed.
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // Read just the first column.
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // Test the mapping of the empty string to all columns.
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testEmptyFile() throws Exception {
    JobConf job = new JobConf(conf);
    Properties properties = new Properties();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    writer.close(true);

    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    SerDe serde = new OrcSerde();
    serde.initialize(conf, properties);

    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // read the whole file
    conf.set("hive.io.file.readcolumn.ids", "0,1");
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Object value = reader.createValue();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    assertEquals(false, reader.next(key, value));
    reader.close();
    assertEquals(null, serde.getSerDeStats());
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getTableSplit();

    Job job = new Job(jobConf);
    TaskAttemptContext tac = ShimLoader.getHadoopShims().newTaskAttemptContext(job.getConfiguration(),
            reporter);

    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader;
    if (hbaseSplit.isTxIndexScan()) {
        LOG.info("getRecordReader: TxHiveIndexScan -> " + tableSplit);
        recordReader = TxHiveTableInputFormatUtil.createRecordReader(tableSplit, tac, jobConf);
    } else {
        LOG.info("getRecordReader: no TxHiveIndexScan -> " + tableSplit);
        setHTable(HiveHBaseInputFormatUtil.getTable(jobConf));
        setScan(HiveHBaseInputFormatUtil.getScan(jobConf));
        recordReader = createRecordReader(tableSplit, tac);
    }
    try {
        recordReader.initialize(tableSplit, tac);
    } catch (InterruptedException e) {
        throw new IOException("Failed to initialize RecordReader", e);
    }

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public void close() throws IOException {
            recordReader.close();
            closeTable();
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(new Result());
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        @Override
        public boolean next(ImmutableBytesWritable rowKey, ResultWritable value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    value.setResult(recordReader.getCurrentValue());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableSnapshotInputFormat.java
License:Apache License
@Override
public RecordReader<ImmutableBytesWritable, ResultWritable> getRecordReader(InputSplit split, JobConf job,
        Reporter reporter) throws IOException {
    setColumns(job);
    final RecordReader<ImmutableBytesWritable, Result> rr = delegate
            .getRecordReader(((HBaseSplit) split).getSnapshotSplit(), job, reporter);

    return new RecordReader<ImmutableBytesWritable, ResultWritable>() {

        @Override
        public boolean next(ImmutableBytesWritable key, ResultWritable value) throws IOException {
            return rr.next(key, value.getResult());
        }

        @Override
        public ImmutableBytesWritable createKey() {
            return rr.createKey();
        }

        @Override
        public ResultWritable createValue() {
            return new ResultWritable(rr.createValue());
        }

        @Override
        public long getPos() throws IOException {
            return rr.getPos();
        }

        @Override
        public void close() throws IOException {
            rr.close();
        }

        @Override
        public float getProgress() throws IOException {
            return rr.getProgress();
        }
    };
}
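Note that, unlike the wrappers above, both the delegate and the wrapper here use the old mapred API, so getProgress() passes straight through with no InterruptedException-to-IOException translation.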
From source file:com.ibm.jaql.io.hadoop.CompositeInputAdapter.java
License:Apache License
@SuppressWarnings("unchecked") public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { CompositeSplit cSplit = (CompositeSplit) split; // 1. get the InputAdapter's array index (i) from the split final int idx = cSplit.getAdapterIdx(); InputSplit baseSplit = cSplit.getSplit(); try {//from ww w. java 2 s.c o m // 2. get the ith adapter's args record JsonValue value = this.args.get(idx); // JRecord baseArgs = (JRecord) item.getNonNull(); // record the current index to the job conf // ASSUMES: in map/reduce, the format's record reader is called *before* // the map class is configured writeCurrentIndex(job, idx); // FIXME: no longer needed // 3. insantiate and initialize the adapter HadoopInputAdapter adapter = (HadoopInputAdapter) AdapterStore.getStore().input .getAdapter(/** baseArgs, */ value); // 4. create a new JobConf j' JobConf jTmp = new JobConf(job); // 5. call adapter's setupConf(j') // ConfiguratorUtil.writeToConf(adapter, jTmp, item/**baseArgs*/); adapter.setParallel(jTmp); // 6. configure the adapter from j' adapter.configure(jTmp); // 7. call adapter's getRecordReader with j' final RecordReader<JsonHolder, JsonHolder> reader = (RecordReader<JsonHolder, JsonHolder>) adapter .getRecordReader(baseSplit, jTmp, reporter); if (!addIndex) { return reader; } return new RecordReader<JsonHolder, JsonHolder>() { @Override public void close() throws IOException { reader.close(); } @Override public JsonHolder createKey() { return reader.createKey(); } @Override public JsonHolder createValue() { return reader.createValue(); } @Override public long getPos() throws IOException { return reader.getPos(); } @Override public float getProgress() throws IOException { return reader.getProgress(); } @Override public boolean next(JsonHolder key, JsonHolder value) throws IOException { BufferedJsonArray pair = (BufferedJsonArray) value.value; if (pair != null) { value.value = pair.get(1); } else { pair = new BufferedJsonArray(2); pair.set(0, JsonLong.make(idx)); } if (reader.next(key, value)) { pair.set(1, value.value); value.value = pair; return true; } return false; } }; } catch (Exception e) { return null; } }
From source file:com.ibm.jaql.io.hadoop.DefaultHadoopInputAdapter.java
License:Apache License
@SuppressWarnings("unchecked") public RecordReader<JsonHolder, JsonHolder> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException { if (split instanceof DHIASplit) { // not using order-preserving wrapper split = ((DHIASplit) split).split; }// w ww . j ava2s .c o m if (converter == null) return ((InputFormat<JsonHolder, JsonHolder>) iFormat).getRecordReader(split, job, reporter); final RecordReader<K, V> baseReader = ((InputFormat<K, V>) iFormat).getRecordReader(split, job, reporter); final K baseKey = baseReader.createKey(); final V baseValue = baseReader.createValue(); return new RecordReader<JsonHolder, JsonHolder>() { public void close() throws IOException { baseReader.close(); } public JsonHolder createKey() { return keyHolder(); } public JsonHolder createValue() { JsonHolder holder = valueHolder(); holder.value = converter.createTarget(); return holder; } public long getPos() throws IOException { return baseReader.getPos(); } public float getProgress() throws IOException { return baseReader.getProgress(); } public boolean next(JsonHolder key, JsonHolder value) throws IOException { boolean hasMore = baseReader.next(baseKey, baseValue); if (!hasMore) return false; value.value = converter.convert(baseKey, baseValue, value.value); return true; } }; }
From source file:org.datavec.hadoop.records.reader.TestBasicHDFS_Integration.java
License:Apache License
/**
 * Things we'd need:
 * 1. JobConf
 * 2. some way to get input splits
 *
 * @throws IOException
 */
@Test
public void testParametersInputSplitSetup() throws IOException {
    JobConf job = new JobConf(defaultConf);

    // app.input.path
    String split_filename = "src/test/resources/records/reader/SVMLightRecordReaderInput/record_reader_input_test.txt";
    Path splitPath = new Path(split_filename);

    InputSplit[] splits = generateDebugSplits(splitPath, job);
    System.out.println("split count: " + splits.length);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    RecordReader<LongWritable, Text> reader = null;
    LongWritable key = new LongWritable();
    Text value = new Text();
    final Reporter voidReporter = Reporter.NULL;
    reader = format.getRecordReader(splits[0], job, voidReporter);

    // Read until the reader reports the whole split consumed, but stop as
    // soon as next() signals end-of-input so a reader that never reaches
    // progress 1.0 cannot spin forever.
    while (reader.getProgress() < 1.0) {
        if (!reader.next(key, value)) {
            break;
        }
        System.out.println("line: " + value.toString());
    }
    reader.close();
}
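Here getProgress() drives the read loop itself: the test keeps calling next() until the reader reports that the whole split has been consumed. The guard on next()'s return value is what keeps the loop safe for readers whose reported progress never quite reaches 1.0.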
From source file:org.terrier.structures.indexing.singlepass.hadoop.BitPostingIndexInputFormat.java
License:Mozilla Public License
/** Test method, runs splits for inverted/lexicon with the command line specified index */
public static void main(String[] args) throws Exception {
    Index.setIndexLoadingProfileAsRetrieval(false);
    IndexOnDisk index = Index.createIndex(args[1], args[2]);
    if (args[0].equals("--splits")) {
        JobConf job = HadoopPlugin.getJobFactory(BitPostingIndexInputFormat.class.getSimpleName()).newJob();
        HadoopUtility.toHConfiguration(index, job);
        setStructures(job, "inverted", "lexicon");
        index.close();
        new BitPostingIndexInputFormat().getSplits(job, 100);
    } else {
        JobConf job = HadoopPlugin.getJobFactory(BitPostingIndexInputFormat.class.getSimpleName()).newJob();
        setStructures(job, "linksin", "linksin-lookup");
        HadoopUtility.toHConfiguration(index, job);
        index.close();
        InputSplit s = new BitPostingIndexInputSplit(new Path(args[3]), Long.parseLong(args[4]),
                Long.parseLong(args[5]), new String[0], Integer.parseInt(args[6]), Integer.parseInt(args[7]));
        RecordReader<IntWritable, IntObjectWrapper<IterablePosting>> rr = new BitPostingIndexInputFormat()
                .getRecordReader(s, job, new Reporter() {
                    public InputSplit getInputSplit() throws UnsupportedOperationException {
                        return null;
                    }

                    @SuppressWarnings({ "rawtypes" })
                    public void incrCounter(Enum arg0, long arg1) {
                    }

                    public void incrCounter(String arg0, String arg1, long arg2) {
                    }

                    @SuppressWarnings({ "rawtypes" })
                    public org.apache.hadoop.mapred.Counters.Counter getCounter(Enum arg0) {
                        return null;
                    }

                    public org.apache.hadoop.mapred.Counters.Counter getCounter(String arg0, String arg1) {
                        return null;
                    }

                    public void setStatus(String arg0) {
                    }

                    public void progress() {
                    }
                });
        IntWritable key = rr.createKey();
        IntObjectWrapper<IterablePosting> value = rr.createValue();
        long pointers = 0;
        int lastId = 0;
        int nonZeroEntryCount = 0;
        float maxProgress = 0;
        while (rr.next(key, value)) {
            IterablePosting ip = value.getObject();
            lastId = key.get();
            while (ip.next() != IterablePosting.EOL) {
                pointers++;
            }
            nonZeroEntryCount++;
            if (rr.getProgress() > maxProgress)
                maxProgress = rr.getProgress();
        }
        rr.close();
        System.out.println("maxProgress=" + maxProgress + " Lastid=" + lastId + " nonZeroEntryCount="
                + nonZeroEntryCount + " postings=" + pointers);
    }
}