List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat#getSplits
public List<InputSplit> getSplits(JobContext job) throws IOException
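Before the project-specific examples below, here is a minimal, self-contained sketch of calling getSplits directly, for instance from a test or a small driver. The class name GetSplitsExample, the use of TextInputFormat, and the command-line input path are illustrative assumptions, not code from any of the listed projects.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class GetSplitsExample {
    public static void main(String[] args) throws IOException {
        // Assumes an input directory or file is passed as the first argument.
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // Compute the splits the framework would hand to map tasks.
        TextInputFormat inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);

        for (InputSplit split : splits) {
            FileSplit fileSplit = (FileSplit) split;
            System.out.println(fileSplit.getPath() + " offset=" + fileSplit.getStart()
                    + " length=" + fileSplit.getLength());
        }
    }
}

Each example that follows makes the same call from inside a larger input format, test driver, or ingestion framework.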
From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat.java
License:Open Source License
private List<InputSplit> getInputSplits(JobContext jobContext, String inputFormatClass, Path path)
        throws ClassNotFoundException, IOException {
    Configuration conf = jobContext.getConfiguration();
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), conf);

    // Set the input path for the left data set
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    String dirs = conf.get(INPUT_DIR);
    conf.set(INPUT_DIR, dirStr);

    return inputFormat.getSplits(jobContext);
}
From source file:com.asakusafw.testdriver.file.FileInputFormatDriver.java
License:Apache License
/**
 * Creates a new instance.
 * @param context target context with source information
 * @param definition the data model definition
 * @param format the input format
 * @throws IOException if failed to initialize
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
FileInputFormatDriver(DataModelDefinition<V> definition, TaskAttemptContext context,
        FileInputFormat<?, V> format) throws IOException {
    if (definition == null) {
        throw new IllegalArgumentException("definition must not be null"); //$NON-NLS-1$
    }
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    if (format == null) {
        throw new IllegalArgumentException("format must not be null"); //$NON-NLS-1$
    }
    LOG.debug("Emulating InputFormat: {}", format.getClass().getName());
    this.definition = definition;
    this.context = context;
    this.format = format;
    LOG.debug("Computing input splits: {}", format.getClass().getName());
    this.splits = new LinkedList<>(format.getSplits(context));
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * Go through each original inputsplit, get its file path, and check the index file:
 * a) keep it when there is no index prebuilt on this file
 *    (or the index file doesn't match with the base file's checksum);
 * b) remove it when no matching value is found in the existing index file;
 * c) construct new smaller inputsplits using indexed blocks found
 *    in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);
    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);
    List<InputSplit> splits = realInputFormat.getSplits(job);

    // if indexing jobs, don't skip any input splits.
    // if searching job but no searching filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null;              // remember the last input file we saw
    boolean foundIndexedFile = false;  // is there an index file for prevFile?
    boolean firstTime = true;          // is this the first time we see this file?

    long totalOriginalBytes = 0;       // the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file we only add once its filtered input splits using index file
            if (firstTime) {
                // LOG.info("first time saw " + path
                //         + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size()
            + ", seconds used: " + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s,"
            + " after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}
From source file:gobblin.source.extractor.hadoop.HadoopFileInputSource.java
License:Apache License
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        Job job = Job.getInstance(new Configuration());

        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));
            }
        }

        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();
        }

        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType.valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }

        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}
From source file:it.crs4.seal.tsv_sort.TextSampler.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat The input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);

    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;

    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
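A partition file of boundary keys like the one written above is the input Hadoop's TotalOrderPartitioner expects. The sketch below shows a typical (assumed) way a driver could consume such a file; the class name TotalOrderDriverSketch and the job name are illustrative and this wiring is not taken from the it.crs4.seal project.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderDriverSketch {
    public static void configure(Path partFile) throws IOException {
        Job job = Job.getInstance(new Configuration(), "total-order-sort");
        // The map output key type must match the key type sampled into the partition file.
        job.setMapOutputKeyClass(Text.class);
        // TotalOrderPartitioner routes keys to reducers using the sampled boundary keys.
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partFile);
    }
}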
From source file:org.kitesdk.data.spi.filesystem.InputFormatReader.java
License:Apache License
@Override
public void initialize() {
    Preconditions.checkState(ReaderWriterState.NEW.equals(state),
            "A reader may not be opened more than once - current state:%s", state);
    try {
        FileInputFormat format = InputFormatUtil.newInputFormatInstance(descriptor);
        Job job = Hadoop.Job.newInstance.invoke(conf);
        FileInputFormat.addInputPath(job, path);

        // attempt to minimize the number of InputSplits
        FileStatus stat = fs.getFileStatus(path);
        FileInputFormat.setMaxInputSplitSize(job, stat.getLen());

        this.splits = format.getSplits(job).iterator();
        this.shouldAdvance = true;
        this.state = ReaderWriterState.OPEN;
    } catch (RuntimeException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetOperationException("Cannot calculate splits", e);
    } catch (IOException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetIOException("Cannot calculate splits", e);
    }
}
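The setMaxInputSplitSize call above caps the split size at the file's length so that getSplits produces as few splits as practical for a single file. More generally, split sizing can be steered with FileInputFormat's standard min/max split-size helpers. A minimal sketch under assumed names (SplitSizingSketch, a command-line input path, and arbitrary 64 MB/512 MB bounds) follows; it is not part of the Kite SDK example.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitSizingSketch {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Steer the computed split size: at least 64 MB and at most 512 MB per split.
        FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(job, 512L * 1024 * 1024);
        System.out.println("splits: " + new TextInputFormat().getSplits(job).size());
    }
}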