List of usage examples for org.apache.hadoop.mapred FileInputFormat getSplits
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException
From source file:com.hdfs.concat.crush.CrushReducer.java
License:Apache License
/**
 * Opens a record reader over {@code inputPath} using the input format class stored at position
 * {@code idx} of {@code inFormatClsList}.
 *
 * <p>The shared {@code job} configuration is pointed at {@code inputPath} before the splits are
 * requested, and the input format is asked for a single split.
 *
 * @param idx       index into {@code inFormatClsList} of the input format class to instantiate
 * @param inputPath the path to read
 * @param reporter  handed to the input format when the reader is created
 * @return the reader for the only split returned by the input format
 * @throws IOException              rethrown unchanged when raised inside the try block
 * @throws IllegalArgumentException if the input format does not return exactly one split
 * @throws RuntimeException         wrapping any other checked exception (for example a failure to
 *                                  instantiate the input format class)
 */
@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
    throws IOException {
  LOG.info(format("Opening '%s'", inputPath));

  Class<? extends FileInputFormat<?, ?>> formatClass =
      (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx);

  try {
    FileInputFormat.setInputPaths(job, inputPath);

    FileInputFormat<?, ?> inputFormat = formatClass.newInstance();
    if (inputFormat instanceof JobConfigurable) {
      JobConfigurable configurable = (JobConfigurable) inputFormat;
      configurable.configure(job);
    }

    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    if (inputSplits.length != 1) {
      throw new IllegalArgumentException("Could not get input splits: " + inputPath);
    }

    InputSplit onlySplit = inputSplits[0];
    return (RecordReader<Object, Object>) inputFormat.getRecordReader(onlySplit, job, reporter);
  } catch (IOException e) {
    // Already part of this method's contract; pass through untouched.
    throw e;
  } catch (RuntimeException e) {
    // Unchecked failures (including the IllegalArgumentException above) are not re-wrapped.
    throw e;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
From source file:com.m6d.filecrush.crush.CrushReducer.java
License:Apache License
@SuppressWarnings("unchecked") private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter) throws IOException { LOG.info(format("Opening '%s'", inputPath)); Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx); try {//from www . j a v a 2 s.c o m FileInputFormat.setInputPaths(job, inputPath); FileInputFormat<?, ?> instance = cls.newInstance(); if (instance instanceof JobConfigurable) { ((JobConfigurable) instance).configure(job); } InputSplit[] splits = instance.getSplits(job, 1); if (1 != splits.length) { throw new IllegalArgumentException("Could not get input splits: " + inputPath); } return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter); } catch (RuntimeException e) { throw e; } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException(e); } }
From source file:gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java
License:Apache License
@Override public List<WorkUnit> getWorkunits(SourceState state) { JobConf jobConf = new JobConf(new Configuration()); for (String key : state.getPropertyNames()) { jobConf.set(key, state.getProp(key)); }/*from w w w . j a va 2s . c o m*/ if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) { for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) { FileInputFormat.addInputPath(jobConf, new Path(inputPath)); } } try { FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf); InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf, state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY, HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED)); if (fileSplits == null || fileSplits.length == 0) { return ImmutableList.of(); } Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY) ? Extract.TableType .valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase()) : null; String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY); String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY); List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length); for (InputSplit inputSplit : fileSplits) { // Create one WorkUnit per InputSplit FileSplit fileSplit = (FileSplit) inputSplit; Extract extract = createExtract(tableType, tableNamespace, tableName); WorkUnit workUnit = WorkUnit.create(extract); workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit)); workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString()); workUnits.add(workUnit); } return workUnits; } catch (IOException ioe) { throw new RuntimeException("Failed to get workunits", ioe); } }
From source file:org.pooledtimeseries.cartesian.CartesianInputFormat.java
License:Apache License
private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits) throws ClassNotFoundException, IOException { // Create a new instance of the input format FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass), conf);/*www . j ava 2s. co m*/ // Set the input path for the left data set inputFormat.setInputPaths(conf, inputPath); // Get the left input splits return inputFormat.getSplits(conf, numSplits); }
From source file:org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License:Apache License
/**
 * Reads every record of every split produced by {@code format} and returns the string form of
 * each record key, in the order the records are read.
 *
 * @param format   input format used both to compute the splits and to open a reader per split
 * @param job      job configuration passed to the input format
 * @param n        number of splits passed to {@code getSplits}
 * @param reporter passed to the input format when each reader is opened
 * @return the {@code toString()} of every key read, across all splits
 * @throws IOException if computing the splits, opening a reader, reading or closing fails
 */
private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
    throws IOException {
  List<String> found = new ArrayList<String>();
  for (InputSplit split : format.getSplits(job, n)) {
    RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
    try {
      // Key/value holders are created inside the try so that the reader is still closed if
      // either factory call throws.
      Text key = reader.createKey();
      Text value = reader.createValue();
      while (reader.next(key, value)) {
        // The reader is handed the same key object on every call, so keep a String copy
        // rather than the Text itself.
        found.add(key.toString());
      }
    } finally {
      reader.close();
    }
  }
  return found;
}