List of usage examples for org.apache.hadoop.mapred RecordReader createValue
V createValue();
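createValue() belongs to the old org.apache.hadoop.mapred RecordReader API: it allocates a reusable value object of the correct type, which next(key, value) then fills in for each record. Every example below follows the same read loop. The following is a minimal, self-contained sketch of that idiom, not taken from any one example; the input path and the TextInputFormat / LongWritable / Text types are illustrative assumptions.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class CreateValueSketch {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf();
        // hypothetical input directory; replace with a real path
        conf.set("mapred.input.dir", args.length > 0 ? args[0] : "/tmp/example-input");

        TextInputFormat format = new TextInputFormat();
        format.configure(conf);
        InputSplit[] splits = format.getSplits(conf, 1);

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, conf, Reporter.NULL);
            // createKey()/createValue() allocate reusable buffers that next() fills in place
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

Because the same key and value instances are reused across calls to next(), a caller that needs to keep a record must copy it before the next iteration.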
From source file:TestFormatStorageRecordReader.java
License:Open Source License
public static void main(String[] argv) throws IOException {
    try {
        String path1 = "se_test/fs/basic/f1/kt/";
        String path2 = "se_test/fs/basic/f2/";

        initFormatData();

        JobConf conf1 = new JobConf(TestFormatStorageRecordReader.class);
        JobConf conf2 = new JobConf(TestFormatStorageRecordReader.class);

        FormatStorageSerDe serDe1 = initSerDe(conf1);
        FormatStorageSerDe serDe2 = initSerDe(conf2);

        StandardStructObjectInspector oi1 = (StandardStructObjectInspector) serDe1.getObjectInspector();
        List<? extends StructField> fieldRefs1 = oi1.getAllStructFieldRefs();

        StandardStructObjectInspector oi2 = (StandardStructObjectInspector) serDe2.getObjectInspector();
        List<? extends StructField> fieldRefs2 = oi2.getAllStructFieldRefs();

        InputFormat inputFormat = new FormatStorageInputFormat();

        RecordReader<WritableComparable, Writable> currRecReader1 = getRecReader(conf1, path1);
        WritableComparable key;
        Writable value;
        key = currRecReader1.createKey();
        value = currRecReader1.createValue();

        System.out.println("currRecReader1. output....");
        while (currRecReader1.next(key, value)) {
            ((Record) value).show();
            System.out.println("end value.show");

            Object row = serDe1.deserialize((Record) value);
            Record record = (Record) serDe1.serialize(row, oi1);
            record.show();
        }

        /*
        RecordReader<WritableComparable, Writable> currRecReader2 = getRecReader(conf2, path2);
        key = currRecReader2.createKey();
        value = currRecReader2.createValue();

        System.out.println("currRecReader2. output....");
        while (currRecReader2.next(key, value)) {
            ((Record) value).show();
        }

        RecordReader<WritableComparable, Writable> currRecReader3 = getRecReader(conf1, path1);
        key = currRecReader3.createKey();
        value = currRecReader3.createValue();

        System.out.println("currRecReader3. output....");
        while (currRecReader3.next(key, value)) {
            ((Record) value).show();
        }
        */
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:RunText.java
License:Apache License
@Override
public void run() {
    try {
        JobConf job = new JobConf();
        job.setInputFormat(format.getClass());
        RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
        Text value = reader.createValue();
        LongWritable key = reader.createKey();
        int count = 0;
        long t1 = System.nanoTime();
        while (reader.next(key, value)) {
            List<String> values = parse(value);
            if (values.get(index).equals(toFind)) {
                System.out.println(value);
            }
            count++;
            if (count == 100) {
                totalCount.addAndGet(100);
                count = 0;
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        runningThreads.decrementAndGet();
    }
}
From source file:Text2FormatStorageMR.java
License:Open Source License
@SuppressWarnings("unchecked") public static int readFormatFile(JobConf conf, String inputPath, int lineNum) throws Exception { RecordReader<WritableComparable, Writable> currRecReader; conf.set("mapred.input.dir", inputPath); InputFormat inputFormat = new FormatStorageInputFormat(); InputSplit[] inputSplits = inputFormat.getSplits(conf, 1); if (inputSplits.length == 0) { System.out.println("inputSplits is empty"); return -1; }// ww w . j av a2s.c o m currRecReader = inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL); WritableComparable key; Writable value; key = currRecReader.createKey(); value = currRecReader.createValue(); int num = 0; while (true) { boolean ret = currRecReader.next(key, value); if (ret) { Text Line = (Text) key; System.out.println(Line.toString()); num++; if (num >= lineNum) break; } else break; } return 0; }
From source file:TestFormatStorageInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestFormatStorageInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestFormatStorageInputFormat.class);
        conf.setJobName("TestFormatStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);

        Head head = new Head();
        initHead(head);
        head.toJobConf(conf);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new FormatStorageInputFormat();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            FormatStorageSplit split = (FormatStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + " offset:" + split.getStart() + " len:" + split.getLength()
                    + " path:" + conf.get(ConstVar.InputPath) + " beginLine:" + split.getBeginLine()
                    + " endLine:" + split.getEndLine() + "\n");
        }

        {
            int totalDelay = 0;
            RecordReader<WritableComparable, Writable> currRecReader = null;
            for (int i = 0; i < inputSplits.length; i++) {
                currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

                WritableComparable key;
                Writable value;
                key = currRecReader.createKey();
                value = currRecReader.createValue();

                long begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {
                    Record record = (Record) value;
                    Object row = serDe.deserialize(record);
                    count++;
                }
                long end = System.currentTimeMillis();

                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            }

            System.out.println("total delay:" + totalDelay);
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:TestTextInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestTextInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestTextInputFormat.class);
        conf.setJobName("TestTextInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);

        LazySimpleSerDe serDe = initSerDe(conf);
        LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new TextInputFormat();
        ((TextInputFormat) inputFormat).configure(conf);
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int totalDelay = 0;
        RecordReader<WritableComparable, Writable> currRecReader = null;
        for (int i = 0; i < inputSplits.length; i++) {
            currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

            WritableComparable key;
            Writable value;
            key = currRecReader.createKey();
            value = currRecReader.createValue();

            long begin = System.currentTimeMillis();
            int count = 0;
            while (currRecReader.next(key, value)) {
                Object row = serDe.deserialize((Text) value);
                oi.getStructFieldsDataAsList(row);
                count++;
            }
            long end = System.currentTimeMillis();

            long delay = (end - begin) / 1000;
            totalDelay += delay;
            System.out.println(count + " record read over, delay " + delay + " s");
        }

        System.out.println("total delay:" + totalDelay);
        return;
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:TestColumnStorageInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestColumnStorageInputFormat <input> idx");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestColumnStorageInputFormat.class);
        conf.setJobName("TestColumnStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);
        conf.set("hive.io.file.readcolumn.ids", argv[1]);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new ColumnStorageInputFormat();
        long begin = System.currentTimeMillis();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        long end = System.currentTimeMillis();
        System.out.println("getsplit delay " + (end - begin) + " ms");

        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            ColumnStorageSplit split = (ColumnStorageSplit) inputSplits[i];
            System.out.printf("split:" + i + " offset:" + split.getStart() + " len:" + split.getLength()
                    + " path:" + split.getPath().toString() + " beginLine:" + split.getBeginLine()
                    + " endLine:" + split.getEndLine());
            if (split.getFileName() != null) {
                System.out.println("fileName:" + split.getFileName());
            } else {
                System.out.println("fileName null");
            }
            if (split.fileList() != null) {
                System.out.println("fileList.num:" + split.fileList().size());
                for (int j = 0; j < split.fileList().size(); j++) {
                    System.out.println("filelist " + j + ":" + split.fileList().get(j));
                }
            }
        }

        while (true) {
            int totalDelay = 0;
            RecordReader<WritableComparable, Writable> currRecReader = null;
            for (int i = 0; i < inputSplits.length; i++) {
                currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);

                WritableComparable key;
                Writable value;
                key = currRecReader.createKey();
                value = currRecReader.createValue();

                begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {
                    Record record = (Record) value;
                    Object row = serDe.deserialize(record);
                    count++;
                }
                end = System.currentTimeMillis();

                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            }
            System.out.println("total delay:" + totalDelay + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall)
        throws IOException {
    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();
    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);
    return true;
}
From source file:cascading.scheme.DeprecatedAvroScheme.java
License:Apache License
/**
 * Source method to take an incoming Avro record and make it a Tuple.
 *
 * @param flowProcess The cascading FlowProcess object. Should be passed in by cascading automatically.
 * @param sourceCall  The cascading SourceCall object. Should be passed in by cascading automatically.
 * @return boolean true on successful parsing and collection, false on failure.
 * @throws IOException
 */
@Override
public boolean source(FlowProcess<? extends Configuration> flowProcess,
        SourceCall<Object[], RecordReader> sourceCall) throws IOException {
    @SuppressWarnings("unchecked")
    RecordReader<AvroWrapper<IndexedRecord>, Writable> input = sourceCall.getInput();
    AvroWrapper<IndexedRecord> wrapper = input.createKey();
    if (!input.next(wrapper, input.createValue())) {
        return false;
    }
    IndexedRecord record = wrapper.datum();
    Tuple tuple = sourceCall.getIncomingEntry().getTuple();
    tuple.clear();
    Object[] split = AvroToCascading.parseRecord(record, schema);
    tuple.addAll(split);
    return true;
}
From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
License:Apache License
@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following codes are copied from {@link InputSampler#RandomSampler},
    // but require some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());
        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader, we set them here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());

        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;
        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we only read a bounded number of records from it.
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input.
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing
                    // elements being pushed out
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }

    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test
public void testInOutFormat() throws Exception {
    Properties properties = new Properties();
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class,
                ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    SerDe serde = new OrcSerde();
    HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat();
    FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true,
            properties, Reporter.NULL);
    ReaderWriterProfiler.setProfilerOptions(conf);
    writer.write(serde.serialize(new MyRow(1, 2), inspector));
    writer.write(serde.serialize(new MyRow(2, 2), inspector));
    writer.write(serde.serialize(new MyRow(3, 2), inspector));
    writer.close(true);

    serde = new OrcSerde();
    properties.setProperty("columns", "x,y");
    properties.setProperty("columns.types", "int:int");
    serde.initialize(conf, properties);
    assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass());
    inspector = (StructObjectInspector) serde.getObjectInspector();
    assertEquals("struct<x:int,y:int>", inspector.getTypeName());

    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(1, splits.length);

    // test the validateInput method
    ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3);
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(testFilePath));
    assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));
    fileList.add(fs.getFileStatus(workDir));
    assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList));

    // read the whole file
    org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    Object key = reader.createKey();
    Writable value = (Writable) reader.createValue();
    int rowNum = 0;
    List<? extends StructField> fields = inspector.getAllStructFieldRefs();
    IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
    assertEquals(0.0, reader.getProgress(), 0.00001);
    assertEquals(0, reader.getPos());
    while (reader.next(key, value)) {
        assertEquals(++rowNum,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    assertEquals(1.0, reader.getProgress(), 0.00001);
    reader.close();

    // read just the first column
    conf.set("hive.io.file.readcolumn.ids", "0");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(null, inspector.getStructFieldData(value, fields.get(1)));
    }
    assertEquals(3, rowNum);
    reader.close();

    // test the mapping of empty string to all columns
    conf.set("hive.io.file.readcolumn.ids", "");
    reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
    key = reader.createKey();
    value = (Writable) reader.createValue();
    rowNum = 0;
    fields = inspector.getAllStructFieldRefs();
    while (reader.next(key, value)) {
        assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0))));
        assertEquals(2,
                intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1))));
    }
    assertEquals(3, rowNum);
    reader.close();
}