List of usage examples for org.apache.hadoop.mapred JobConf getInputFormat
public InputFormat getInputFormat()
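Before the collected examples, here is a minimal, self-contained sketch of the usual call pattern around JobConf.getInputFormat(): configure an input format and input path, obtain the InputFormat instance, compute splits, and read each split through a RecordReader. The class name GetInputFormatExample and the input path taken from args[0] are placeholders for illustration only; they do not come from any of the source files listed below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetInputFormatExample {
    public static void main(String[] args) throws IOException {
        // Configure the job: input format class and input path (args[0] is a placeholder path).
        JobConf jobConf = new JobConf();
        jobConf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(jobConf, new Path(args[0]));

        // getInputFormat() instantiates and configures the InputFormat set above.
        TextInputFormat format = (TextInputFormat) jobConf.getInputFormat();

        // Compute the splits and read every record of each split.
        InputSplit[] splits = format.getSplits(jobConf, jobConf.getNumMapTasks());
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, jobConf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value); // byte offset and line text
            }
            reader.close();
        }
    }
}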
From source file:org.apache.ignite.internal.processors.hadoop.v1.GridHadoopV1Splitter.java
License:Apache License
/**
 * @param jobConf Job configuration.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobConf jobConf) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = jobConf.getInputFormat();

        assert format != null;

        InputSplit[] splits = format.getSplits(jobConf, 0);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.length);

        for (int i = 0; i < splits.length; i++) {
            InputSplit nativeSplit = splits[i];

            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(), s.getLength()));
            }
            else
                res.add(GridHadoopUtils.wrapSplit(i, nativeSplit, nativeSplit.getLocations()));
        }

        return res;
    }
    catch (IOException e) {
        throw new IgniteCheckedException(e);
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.v1.HadoopV1MapTask.java
License:Apache License
/** {@inheritDoc} */
@SuppressWarnings("unchecked")
@Override
public void run(HadoopTaskContext taskCtx) throws IgniteCheckedException {
    HadoopJob job = taskCtx.job();

    HadoopV2TaskContext ctx = (HadoopV2TaskContext) taskCtx;

    JobConf jobConf = ctx.jobConf();

    InputFormat inFormat = jobConf.getInputFormat();

    HadoopInputSplit split = info().inputSplit();

    InputSplit nativeSplit;

    if (split instanceof HadoopFileBlock) {
        HadoopFileBlock block = (HadoopFileBlock) split;

        nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), EMPTY_HOSTS);
    }
    else
        nativeSplit = (InputSplit) ctx.getNativeSplit(split);

    assert nativeSplit != null;

    Reporter reporter = new HadoopV1Reporter(taskCtx);

    HadoopV1OutputCollector collector = null;

    try {
        collector = collector(jobConf, ctx, !job.info().hasCombiner() && !job.info().hasReducer(),
                fileName(), ctx.attemptId());

        RecordReader reader = inFormat.getRecordReader(nativeSplit, jobConf, reporter);

        Mapper mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf);

        Object key = reader.createKey();
        Object val = reader.createValue();

        assert mapper != null;

        try {
            try {
                while (reader.next(key, val)) {
                    if (isCancelled())
                        throw new HadoopTaskCancelledException("Map task cancelled.");

                    mapper.map(key, val, collector, reporter);
                }
            }
            finally {
                mapper.close();
            }
        }
        finally {
            collector.closeWriter();
        }

        collector.commit();
    }
    catch (Exception e) {
        if (collector != null)
            collector.abort();

        throw new IgniteCheckedException(e);
    }
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 *
 * @throws IOException
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.Step0JobTest.java
License:Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, value, collector, reporter);

            size++;
        }

        mapper.close();

        // validate the mapper's output
        assertEquals(p, collector.keys[p]);
        assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
        assertEquals(size, collector.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapred.partial.Step0JobTest.java
License:Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).label;
            }

            size++;
        }

        keys[p] = p;
        values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.apache.reef.io.data.loading.impl.AbstractEvaluatorToPartitionStrategy.java
License:Apache License
@SuppressWarnings("rawtypes") AbstractEvaluatorToPartitionStrategy(final String inputFormatClassName, final Set<String> serializedDataPartitions) { LOG.fine("AbstractEvaluatorToPartitionStrategy injected"); Validate.notEmpty(inputFormatClassName); Validate.notEmpty(serializedDataPartitions); locationToSplits = new ConcurrentHashMap<>(); evaluatorToSplits = new ConcurrentHashMap<>(); unallocatedSplits = new LinkedBlockingQueue<>(); setUp();// ww w .ja va 2 s . co m final Map<DistributedDataSetPartition, InputSplit[]> splitsPerPartition = new HashMap<>(); for (final String serializedDataPartition : serializedDataPartitions) { final DistributedDataSetPartition dp = DistributedDataSetPartitionSerializer .deserialize(serializedDataPartition); final ExternalConstructor<JobConf> jobConfExternalConstructor = new JobConfExternalConstructor( inputFormatClassName, dp.getPath()); try { final JobConf jobConf = jobConfExternalConstructor.newInstance(); final InputFormat inputFormat = jobConf.getInputFormat(); final InputSplit[] inputSplits = inputFormat.getSplits(jobConf, dp.getDesiredSplits()); if (LOG.isLoggable(Level.FINEST)) { LOG.log(Level.FINEST, "Splits for partition: {0} {1}", new Object[] { dp, Arrays.toString(inputSplits) }); } this.totalNumberOfSplits += inputSplits.length; splitsPerPartition.put(dp, inputSplits); } catch (final IOException e) { throw new RuntimeException("Unable to get InputSplits using the specified InputFormat", e); } } init(splitsPerPartition); LOG.log(Level.FINE, "Total Number of splits: {0}", this.totalNumberOfSplits); }
From source file:org.apache.reef.io.data.loading.impl.InputFormatExternalConstructor.java
License:Apache License
@Inject
public InputFormatExternalConstructor(final JobConf jobConf) {
    this.jobConf = jobConf;
    inputFormat = jobConf.getInputFormat();
}
From source file:org.apache.tez.mapreduce.hadoop.MRInputHelpers.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(JobConf jobConf, boolean groupSplits, int numTasks) throws IOException { // This is the real InputFormat org.apache.hadoop.mapred.InputFormat inputFormat; try {/*w w w . ja v a 2 s . c om*/ inputFormat = jobConf.getInputFormat(); } catch (Exception e) { throw new TezUncheckedException(e); } org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat; if (groupSplits) { org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat = new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat(); groupedFormat.setConf(jobConf); groupedFormat.setInputFormat(inputFormat); groupedFormat.setDesiredNumberOfSplits(numTasks); finalInputFormat = groupedFormat; } else { finalInputFormat = inputFormat; } org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat.getSplits(jobConf, jobConf.getNumMapTasks()); // sort the splits into order based on size, so that the biggest // go first Arrays.sort(splits, new OldInputSplitComparator()); return splits; }
From source file:org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java
License:Apache License
private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
    if (confTuples != null)
        return confTuples;

    Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);
    FileStatus f = fs.getFileStatus(inputPath);
    //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
    Set<ConfigurationTuple> configTuples = new HashSet<>();
    int workerId = 0;

    JobConf job = new JobConf(new Configuration());
    job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); //TextInputFormat.class);

    FileInputFormat.setInputPaths(job, inputPath);
    InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

    for (InputSplit split : splits) {
        FileSplit convertedToMetronomeSplit = new FileSplit();

        org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

        if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {
            convertedToMetronomeSplit.setLength(hadoopFileSplit.getLength());
            convertedToMetronomeSplit.setOffset(hadoopFileSplit.getStart());
            convertedToMetronomeSplit.setPath(hadoopFileSplit.getPath().toString());

            StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
                    .setIterations(iterationCount).setOther(appConfig).setSplit(convertedToMetronomeSplit)
                    .build();

            String wid = "worker-" + workerId;
            ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);

            configTuples.add(tuple);
            workerId++;

            LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());
        } else {
            LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());
        }
    }

    LOG.info("Total Splits/Workers: " + configTuples.size());

    confTuples = configTuples;

    return configTuples;
}