List of usage examples for the getSplits method of org.apache.hadoop.mapred.InputFormat
/**
 * Computes the input splits for a job.
 *
 * @param job       job configuration the splits are derived from
 * @param numSplits requested number of splits (presumably a hint rather than a
 *                  guarantee; confirm against the Hadoop Javadoc)
 * @return the computed splits
 * @throws IOException declared for implementations that fail while computing splits
 */
InputSplit[] getSplits(JobConf job, int numSplits) throws IOException;
From source file:TestFormatStorageRecordReader.java
License:Open Source License
/**
 * Creates a record reader over the first split of the data stored at {@code path}.
 *
 * <p>The input path of {@code conf} is overwritten with {@code path} before the
 * splits are computed with a {@code FormatStorageInputFormat}.
 *
 * @param conf job configuration; mutated to point at {@code path}
 * @param path input path to read
 * @return a reader for the first computed split
 * @throws IOException if computing the splits or creating the reader fails, or if
 *         the input produces no splits at all
 */
static RecordReader<WritableComparable, Writable> getRecReader(JobConf conf, String path) throws IOException {
    conf.set("mapred.input.dir", path);
    FileInputFormat.setInputPaths(conf, path);

    InputFormat inputFormat = new FormatStorageInputFormat();
    InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
    System.out.println("get Splits:" + inputSplits.length);

    // Fail with a descriptive error instead of an ArrayIndexOutOfBoundsException
    // when the input yields no splits.
    if (inputSplits.length == 0) {
        throw new IOException("no input splits found for path: " + path);
    }
    return inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL);
}
From source file:Text2FormatStorageMR.java
License:Open Source License
/**
 * Prints the keys of the leading records of the first split of {@code inputPath}.
 *
 * <p>The limit is checked after a record has been printed, so at least one record
 * is printed whenever the split is non-empty, even if {@code lineNum} is zero or
 * negative.
 *
 * @param conf      job configuration; its {@code mapred.input.dir} is overwritten
 * @param inputPath path of the format-storage input to read
 * @param lineNum   number of records after which reading stops
 * @return {@code -1} if the input produced no splits, {@code 0} otherwise
 * @throws Exception if computing splits or reading records fails
 */
@SuppressWarnings("unchecked")
public static int readFormatFile(JobConf conf, String inputPath, int lineNum) throws Exception {
    conf.set("mapred.input.dir", inputPath);

    InputFormat inputFormat = new FormatStorageInputFormat();
    InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
    if (inputSplits.length == 0) {
        System.out.println("inputSplits is empty");
        return -1;
    }

    RecordReader<WritableComparable, Writable> currRecReader =
            inputFormat.getRecordReader(inputSplits[0], conf, Reporter.NULL);
    try {
        WritableComparable key = currRecReader.createKey();
        Writable value = currRecReader.createValue();

        int num = 0;
        while (currRecReader.next(key, value)) {
            Text line = (Text) key;
            System.out.println(line.toString());
            num++;
            if (num >= lineNum) {
                break;
            }
        }
    } finally {
        // Release the reader even when reading or the cast above fails.
        currRecReader.close();
    }
    return 0;
}
From source file:TestFormatStorageInputFormat.java
License:Open Source License
/**
 * Command-line driver: computes the splits of a format-storage input, prints each
 * split, then reads and deserializes every record of every split and reports the
 * elapsed time per split.
 *
 * <p>Any exception is caught, its stack trace printed, and the method returns
 * normally.
 *
 * @param argv {@code argv[0]} is the input path, {@code argv[1]} the output path
 * @throws IOException    declared for compatibility; failures are caught internally
 * @throws SerDeException declared for compatibility; failures are caught internally
 */
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestFormatStorageInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestFormatStorageInputFormat.class);
        conf.setJobName("TestFormatStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        // Was "flase": an unparsable boolean that only behaved like false by accident.
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);

        Head head = new Head();
        initHead(head);
        head.toJobConf(conf);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new FormatStorageInputFormat();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            FormatStorageSplit split = (FormatStorageSplit) inputSplits[i];
            // print, not printf: the text contains a path and must not be
            // interpreted as a format string (a '%' in it would throw).
            System.out.print("split:" + i + "offset:" + split.getStart() + "len:" + split.getLength()
                    + "path:" + conf.get(ConstVar.InputPath) + "beginLine:" + split.getBeginLine()
                    + "endLine:" + split.getEndLine() + "\n");
        }

        int totalDelay = 0;
        for (int i = 0; i < inputSplits.length; i++) {
            RecordReader<WritableComparable, Writable> currRecReader =
                    inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
            try {
                WritableComparable key = currRecReader.createKey();
                Writable value = currRecReader.createValue();

                long begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {
                    Record record = (Record) value;
                    Object row = serDe.deserialize(record);
                    count++;
                }
                long end = System.currentTimeMillis();

                // Whole seconds only; sub-second reads are reported as 0.
                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            } finally {
                // Each split gets its own reader; close it before moving on.
                currRecReader.close();
            }
        }
        System.out.println("total delay:" + totalDelay);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:TestTextInputFormat.java
License:Open Source License
public static void main(String[] argv) throws IOException, SerDeException { try {//from w w w .ja v a 2s.c o m if (argv.length != 2) { System.out.println("TestTextInputFormat <input> <output>"); System.exit(-1); } JobConf conf = new JobConf(TestTextInputFormat.class); conf.setJobName("TestTextInputFormat"); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(Unit.Record.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(FormatStorageOutputFormat.class); conf.set("mapred.output.compress", "flase"); conf.set("mapred.input.dir", argv[0]); LazySimpleSerDe serDe = initSerDe(conf); LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector(); List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs(); FileInputFormat.setInputPaths(conf, argv[0]); Path outputPath = new Path(argv[1]); FileOutputFormat.setOutputPath(conf, outputPath); InputFormat inputFormat = new TextInputFormat(); ((TextInputFormat) inputFormat).configure(conf); InputSplit[] inputSplits = inputFormat.getSplits(conf, 1); if (inputSplits.length == 0) { System.out.println("inputSplits is empty"); return; } else { System.out.println("get Splits:" + inputSplits.length); } int totalDelay = 0; RecordReader<WritableComparable, Writable> currRecReader = null; for (int i = 0; i < inputSplits.length; i++) { currRecReader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL); WritableComparable key; Writable value; key = currRecReader.createKey(); value = currRecReader.createValue(); long begin = System.currentTimeMillis(); int count = 0; while (currRecReader.next(key, value)) { Object row = serDe.deserialize((Text) value); oi.getStructFieldsDataAsList(row); count++; } long end = System.currentTimeMillis(); long delay = (end - begin) / 1000; totalDelay += delay; System.out.println(count + " record read over, delay " + delay + " s"); } System.out.println("total delay:" + 
totalDelay); return; } catch (Exception e) { e.printStackTrace(); System.out.println("get exception:" + e.getMessage()); } }
From source file:TestColumnStorageInputFormat.java
License:Open Source License
/**
 * Command-line driver: computes the splits of a column-storage input, prints each
 * split with its file information, then repeatedly reads and deserializes every
 * record of every split, printing the elapsed time of each pass.
 *
 * <p>The read phase is an endless loop with no exit condition; it ends only when
 * an exception is thrown (which is caught and printed) or the process is killed.
 *
 * @param argv {@code argv[0]} is the input path, {@code argv[1]} the column ids to read
 * @throws IOException    declared for compatibility; failures are caught internally
 * @throws SerDeException declared for compatibility; failures are caught internally
 */
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestColumnStorageInputFormat <input> idx");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestColumnStorageInputFormat.class);
        conf.setJobName("TestColumnStorageInputFormat");
        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);
        conf.setInputFormat(TextInputFormat.class);
        // Was "flase": an unparsable boolean that only behaved like false by accident.
        conf.set("mapred.output.compress", "false");
        conf.set("mapred.input.dir", argv[0]);
        conf.set("hive.io.file.readcolumn.ids", argv[1]);

        FormatStorageSerDe serDe = initSerDe(conf);
        StandardStructObjectInspector oi = (StandardStructObjectInspector) serDe.getObjectInspector();
        List<? extends StructField> fieldRefs = oi.getAllStructFieldRefs();

        FileInputFormat.setInputPaths(conf, argv[0]);
        // NOTE(review): argv[1] is used both as the column-id list above and as the
        // output path here; looks like a leftover from a sibling test, confirm.
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new ColumnStorageInputFormat();
        long begin = System.currentTimeMillis();
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        long end = System.currentTimeMillis();
        System.out.println("getsplit delay " + (end - begin) + " ms");

        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int size = inputSplits.length;
        System.out.println("getSplits return size:" + size);
        for (int i = 0; i < size; i++) {
            ColumnStorageSplit split = (ColumnStorageSplit) inputSplits[i];
            // print, not printf: the text contains a path and must not be
            // interpreted as a format string (a '%' in it would throw).
            System.out.print("split:" + i + " offset:" + split.getStart() + "len:" + split.getLength()
                    + "path:" + split.getPath().toString() + "beginLine:" + split.getBeginLine()
                    + "endLine:" + split.getEndLine());
            if (split.getFileName() != null) {
                System.out.println("fileName:" + split.getFileName());
            } else {
                System.out.println("fileName null");
            }
            if (split.fileList() != null) {
                System.out.println("fileList.num:" + split.fileList().size());
                for (int j = 0; j < split.fileList().size(); j++) {
                    System.out.println("filelist " + j + ":" + split.fileList().get(j));
                }
            }
        }

        // Endless read loop: every pass re-reads all splits from scratch.
        while (true) {
            int totalDelay = 0;
            for (int i = 0; i < inputSplits.length; i++) {
                RecordReader<WritableComparable, Writable> currRecReader =
                        inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                try {
                    WritableComparable key = currRecReader.createKey();
                    Writable value = currRecReader.createValue();

                    begin = System.currentTimeMillis();
                    int count = 0;
                    while (currRecReader.next(key, value)) {
                        Record record = (Record) value;
                        Object row = serDe.deserialize(record);
                        count++;
                    }
                    end = System.currentTimeMillis();

                    // Whole seconds only; sub-second reads are reported as 0.
                    long delay = (end - begin) / 1000;
                    totalDelay += delay;
                    System.out.println(count + " record read over, delay " + delay + " s");
                } finally {
                    // Without this, every pass of the endless loop leaked one reader per split.
                    currRecReader.close();
                }
            }
            System.out.println("total delay:" + totalDelay + "\n");
        }
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}
From source file:cascading.tap.hadoop.HadoopMR1TapPlatformTest.java
License:Open Source License
@Test public void testCombinedHfs() throws Exception { getPlatform().copyFromLocal(inputFileLower); getPlatform().copyFromLocal(inputFileUpper); Hfs sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), InputData.inputFileLower); Hfs sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), InputData.inputFileUpper); // create a CombinedHfs instance on these files Tap source = new MultiSourceTap<Hfs, JobConf, RecordReader>(sourceLower, sourceUpper); FlowProcess<JobConf> process = getPlatform().getFlowProcess(); JobConf conf = process.getConfigCopy(); // set the combine flag conf.setBoolean(HfsProps.COMBINE_INPUT_FILES, true); conf.set("cascading.flow.platform", "hadoop"); // only supported on mr based platforms // test the input format and the split source.sourceConfInit(process, conf); InputFormat inputFormat = conf.getInputFormat(); assertEquals(Hfs.CombinedInputFormat.class, inputFormat.getClass()); InputSplit[] splits = inputFormat.getSplits(conf, 1); assertEquals(1, splits.length);//from w w w. jav a 2s . c o m validateLength(source.openForRead(process), 10); }
From source file:cascading.tap.hadoop.io.MultiInputFormat.java
License:Open Source License
private long[] getInputSplitSizes(InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits) throws IOException { long[] inputSizes = new long[inputFormats.length]; for (int i = 0; i < inputFormats.length; i++) { InputFormat inputFormat = inputFormats[i]; InputSplit[] splits = inputFormat.getSplits(jobConfs[i], numSplits); inputSizes[i] = splits.length;//from w w w. j a v a 2s .c om } return inputSizes; }
From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License:Apache License
private void addSplitsForGroup(List<Path> dirs, TableScanOperator tableScan, JobConf conf, InputFormat inputFormat, Class<? extends InputFormat> inputFormatClass, int splits, TableDesc table, List<InputSplit> result) throws IOException { Utilities.copyTableJobPropertiesToConf(table, conf); // The table and database name to scan. // TODO: This is commented out until we have a pluggable way to configure the // SerDe. Until then, the create a separate table and set the job conf properties. // String fqTblName[] = table.getTableName().split("\\."); ///*from w w w . jav a 2 s.co m*/ // conf.set("recordservice.table.name", table.getTableName()); if (tableScan != null) { pushFilters(conf, tableScan); // Set the projected column and table info for the RecordServiceRecordReader. conf.set("recordservice.col.names", Joiner.on(",").join(tableScan.getNeededColumns())); } // Unset the file config. We're going to be just reading from the table. conf.unset(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR); conf.setInputFormat(inputFormat.getClass()); // Generate the RecordService InputSplit[] iss = inputFormat.getSplits(conf, splits); for (InputSplit is : iss) { if (is instanceof FileSplit) { FileSplit fileSplit = (FileSplit) is; LOG.info("INPUT SPLIT: " + fileSplit.getPath().toString()); } } // Wrap the InputSplits in HiveInputSplits. We use modified version of the // HiveInputSplit to work around some issues with the base one. // TODO: Get changes incorporated into Hive. for (InputSplit is : iss) { result.add(new HiveInputSplitShim(dirs.get(0), is, inputFormatClass.getName())); } }
From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
License:Apache License
@Override public Object[] getSample(InputFormat inf, JobConf job) throws IOException { // the following codes are copied from {@link InputSampler#RandomSampler}, // but require some modifications. InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks()); ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples); int splitsToSample = Math.min(this.maxSplitsSampled, splits.length); Random r = new Random(); long seed = r.nextLong(); r.setSeed(seed);//from w w w . j av a2 s . c o m // get Sorters Sorter[] sorters = null; if (job.get(ConfigureConstants.SORTERS, null) != null) { // total sort job sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job); } else { // there is no sorter, should be reducer/join job Column[] keys = (Column[]) SerializableUtil .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job); sorters = new Sorter[keys.length]; for (int i = 0; i < keys.length; i++) { sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC); } } long proportion = 10L; while ((int) (this.freq * proportion) == 0) { proportion = proportion * 10; } proportion = 5L * proportion; // shuffle splits for (int i = 0; i < splits.length; ++i) { InputSplit tmp = splits[i]; int j = r.nextInt(splits.length); splits[i] = splits[j]; splits[j] = tmp; } SamplingOutputCollector collector = new SamplingOutputCollector(); for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) { LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size()); RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job, Reporter.NULL); WritableComparable key = reader.createKey(); WritableComparable value = reader.createValue(); if (!(inf instanceof MobiusDelegatingInputFormat)) { // not mobius delegating input format, so the CURRENT_DATASET_ID // will not be set by inf#getRecordReader, we set them here. 
// // set the current dataset id, as the AbstractMobiusMapper#configure // method needs this property. job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS)); } Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID)); LOGGER.info("Samples coming from dataset: " + datasetID.toString()); AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job); mapper.configure(job); // reading elements from one split long readElement = 0; while (reader.next(key, value)) { collector.clear(); Tuple tuple = mapper.parse(key, value); readElement++; if (readElement > (((long) numSamples) * ((long) proportion))) { // a split might be very big (ex: a large gz file), // so we just need to read the break; } if (r.nextDouble() <= freq) { if (samples.size() < numSamples) { mapper.joinmap(key, value, collector, Reporter.NULL); // joinmap function might generate more than one output key // per <code>key</code> input. for (Tuple t : collector.getOutKey()) { Tuple mt = Tuple.merge(tuple, t); DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job); samples.add(nkey); } } else { // When exceeding the maximum number of samples, replace // a random element with this one, then adjust the // frequency to reflect the possibility of existing // elements being pushed out mapper.joinmap(key, value, collector, Reporter.NULL); for (Tuple t : collector.getOutKey()) { int ind = r.nextInt(numSamples); if (ind != numSamples) { Tuple mt = Tuple.merge(tuple, t); DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job); samples.set(ind, nkey); } } freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples; } key = reader.createKey(); value = reader.createValue(); } } reader.close(); } LOGGER.info("Samples have been collected, return."); return samples.toArray(); }
From source file:com.facebook.hive.orc.TestInputOutputFormat.java
License:Apache License
@Test public void testInOutFormat() throws Exception { Properties properties = new Properties(); StructObjectInspector inspector;//w w w. j ava 2s . com synchronized (TestOrcFile.class) { inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } SerDe serde = new OrcSerde(); HiveOutputFormat<?, ?> outFormat = new OrcOutputFormat(); FileSinkOperator.RecordWriter writer = outFormat.getHiveRecordWriter(conf, testFilePath, MyRow.class, true, properties, Reporter.NULL); ReaderWriterProfiler.setProfilerOptions(conf); writer.write(serde.serialize(new MyRow(1, 2), inspector)); writer.write(serde.serialize(new MyRow(2, 2), inspector)); writer.write(serde.serialize(new MyRow(3, 2), inspector)); writer.close(true); serde = new OrcSerde(); properties.setProperty("columns", "x,y"); properties.setProperty("columns.types", "int:int"); serde.initialize(conf, properties); assertEquals(OrcSerde.OrcSerdeRow.class, serde.getSerializedClass()); inspector = (StructObjectInspector) serde.getObjectInspector(); assertEquals("struct<x:int,y:int>", inspector.getTypeName()); InputFormat<?, ?> in = new OrcInputFormat(); FileInputFormat.setInputPaths(conf, testFilePath.toString()); InputSplit[] splits = in.getSplits(conf, 1); assertEquals(1, splits.length); // the the validate input method ArrayList<FileStatus> fileList = new ArrayList<FileStatus>(3); assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList)); fileList.add(fs.getFileStatus(testFilePath)); assertEquals(true, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList)); fileList.add(fs.getFileStatus(workDir)); assertEquals(false, ((InputFormatChecker) in).validateInput(fs, new HiveConf(), fileList)); // read the whole file org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL); Object key = reader.createKey(); Writable value = (Writable) 
reader.createValue(); int rowNum = 0; List<? extends StructField> fields = inspector.getAllStructFieldRefs(); IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector(); assertEquals(0.0, reader.getProgress(), 0.00001); assertEquals(0, reader.getPos()); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(0)))); assertEquals(2, intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1)))); } assertEquals(3, rowNum); assertEquals(1.0, reader.getProgress(), 0.00001); reader.close(); // read just the first column conf.set("hive.io.file.readcolumn.ids", "0"); reader = in.getRecordReader(splits[0], conf, Reporter.NULL); key = reader.createKey(); value = (Writable) reader.createValue(); rowNum = 0; fields = inspector.getAllStructFieldRefs(); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0)))); assertEquals(null, inspector.getStructFieldData(value, fields.get(1))); } assertEquals(3, rowNum); reader.close(); // test the mapping of empty string to all columns conf.set("hive.io.file.readcolumn.ids", ""); reader = in.getRecordReader(splits[0], conf, Reporter.NULL); key = reader.createKey(); value = (Writable) reader.createValue(); rowNum = 0; fields = inspector.getAllStructFieldRefs(); while (reader.next(key, value)) { assertEquals(++rowNum, intInspector.get(inspector.getStructFieldData(value, fields.get(0)))); assertEquals(2, intInspector.get(inspector.getStructFieldData(serde.deserialize(value), fields.get(1)))); } assertEquals(3, rowNum); reader.close(); }