List of usage examples for org.apache.hadoop.mapred.JobConf.getInputFormat()
public InputFormat getInputFormat()
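In the old org.apache.hadoop.mapred API, getInputFormat() instantiates the InputFormat class configured on the JobConf (TextInputFormat by default); the returned object is then used to compute splits and open record readers, which is the pattern every example below follows. Here is a minimal, self-contained sketch of that call pattern. It is not taken from any of the listed projects, and the input path and split count are placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetInputFormatExample {
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Placeholder input path; point this at real data.
        FileInputFormat.addInputPath(conf, new Path("/tmp/input"));
        conf.setInputFormat(TextInputFormat.class); // also the default

        // getInputFormat() instantiates the InputFormat configured above.
        InputFormat<LongWritable, Text> inputFormat = conf.getInputFormat();

        // Compute splits, then read every record of every split.
        InputSplit[] splits = inputFormat.getSplits(conf, conf.getNumMapTasks());
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                    inputFormat.getRecordReader(split, conf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(key.get() + "\t" + value);
            }
            reader.close();
        }
    }
}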
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("See HADOOP-5588")
public void directoryWithSubdirectoryUsingGlob() throws Exception {
    JobConf conf = new JobConf();
    Path path = new Path(BASE_PATH, "dir/a*");
    FileInputFormat.addInputPath(conf, path);
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    assertThat(splits.length, is(1));
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("Needs more investigation")
public void recordsCoincideWithBlocks() throws IOException {
    int recordLength = 1024;
    Path input = new Path("input");
    // createFile, checkSplit and checkRecordReader are helpers defined elsewhere in this test class.
    createFile(input, 12, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    assertThat(splits.length, is(3));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 4, 8);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 8, 12);
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
public void recordsDontCoincideWithBlocks() throws IOException {
    int recordLength = 1024 + 512;
    Path input = new Path("input");
    createFile(input, 8, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    System.out.println(Arrays.asList(splits));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 3);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 3, 6);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 6, 8);
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("Needs more investigation")
public void compression() throws IOException {
    int recordLength = 1024;
    Path input = new Path("input.bz2");
    createFile(input, 24, recordLength);
    System.out.println(">>>>>>" + fs.getLength(input));

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    System.out.println(Arrays.asList(splits));
    assertThat(splits.length, is(2));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 5, 12);
}
From source file: edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory.java
License: Apache License

@Override
public void configure(Map<String, String> configuration, ARecordType outputType) throws Exception {
    if (!initialized) {
        hdfsScheduler = initializeHDFSScheduler();
        initialized = true;
    }
    this.configuration = configuration;
    JobConf conf = configureJobConf(configuration);
    confFactory = new ConfFactory(conf);
    clusterLocations = getClusterLocations();
    int numPartitions = ((AlgebricksAbsolutePartitionConstraint) clusterLocations).getLocations().length;
    // if files list was set, we restrict the splits to the list since this dataset is indexed
    InputSplit[] inputSplits;
    if (files == null) {
        inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
    } else {
        inputSplits = getSplits(conf);
    }
    inputSplitsFactory = new InputSplitsFactory(inputSplits);
    readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
    executed = new boolean[readSchedule.length];
    Arrays.fill(executed, false);
    configured = true;
    atype = (IAType) outputType;
    configureFormat(atype);
}
From source file: edu.uci.ics.asterix.test.runtime.HDFSCluster.java
License: Apache License

public static void main(String[] args) throws Exception {
    HDFSCluster cluster = new HDFSCluster();
    cluster.setup();
    JobConf conf = configureJobConf();
    FileSystem fs = FileSystem.get(conf);
    InputSplit[] inputSplits = conf.getInputFormat().getSplits(conf, 0);
    for (InputSplit split : inputSplits) {
        System.out.println("split :" + split);
    }
    // cluster.cleanup();
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopMapperOperatorDescriptor.java
License: Apache License

private Object getRecordReader(JobConf conf, Object inputSplit)
        throws ClassNotFoundException, IOException, InterruptedException {
    if (conf.getUseNewMapper()) {
        JobContext context = new ContextFactory().createJobContext(conf);
        org.apache.hadoop.mapreduce.InputFormat inputFormat =
                (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                        .newInstance(context.getInputFormatClass(), conf);
        TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(conf, new TaskAttemptID());
        return inputFormat.createRecordReader((org.apache.hadoop.mapreduce.InputSplit) inputSplit,
                taskAttemptContext);
    } else {
        Class inputFormatClass = conf.getInputFormat().getClass();
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
        return inputFormat.getRecordReader((org.apache.hadoop.mapred.InputSplit) inputSplit, conf,
                super.createReporter());
    }
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

public HadoopReadOperatorDescriptor(JobConf jobConf, JobSpecification spec, Object[] splits) throws IOException {
    super(spec, 0, 1);
    this.jobConfMap = DatatypeHelper.jobConf2Map(jobConf);
    InputFormat inputFormat = jobConf.getInputFormat();
    RecordReader recordReader;
    try {
        recordReader = getRecordReader(DatatypeHelper.map2JobConf(jobConfMap), splits[0]);
    } catch (Exception e) {
        throw new IOException(e);
    }
    recordDescriptors[0] = DatatypeHelper.createKeyValueRecordDescriptor(
            (Class<? extends Writable>) recordReader.createKey().getClass(),
            (Class<? extends Writable>) recordReader.createValue().getClass());
    PartitionConstraintHelper.addPartitionCountConstraint(spec, this, splits.length);
    inputSplitsProxy = new InputSplitsProxy(jobConf, splits);
    this.inputFormatClassName = inputFormat.getClass().getName();
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

private RecordReader getRecordReader(JobConf conf, Object inputSplit)
        throws ClassNotFoundException, IOException, InterruptedException {
    RecordReader hadoopRecordReader = null;
    if (conf.getUseNewMapper()) {
        JobContext context = new ContextFactory().createJobContext(conf);
        org.apache.hadoop.mapreduce.InputFormat inputFormat =
                (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                        .newInstance(context.getInputFormatClass(), conf);
        TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(jobConf, null);
        hadoopRecordReader = (RecordReader) inputFormat
                .createRecordReader((org.apache.hadoop.mapreduce.InputSplit) inputSplit, taskAttemptContext);
    } else {
        Class inputFormatClass = conf.getInputFormat().getClass();
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
        hadoopRecordReader = (RecordReader) inputFormat
                .getRecordReader((org.apache.hadoop.mapred.InputSplit) inputSplit, conf, createReporter());
    }
    return hadoopRecordReader;
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

@SuppressWarnings("deprecation")
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        final IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions)
        throws HyracksDataException {
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @Override
        public void initialize() throws HyracksDataException {
            try {
                JobConf conf = DatatypeHelper.map2JobConf((HashMap) jobConfMap);
                Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
                conf.setClassLoader(this.getClass().getClassLoader());
                RecordReader hadoopRecordReader;
                Object key;
                Object value;
                Object[] splits = inputSplitsProxy.toInputSplits(conf);
                Object inputSplit = splits[partition];

                if (conf.getUseNewMapper()) {
                    JobContext context = new ContextFactory().createJobContext(conf);
                    org.apache.hadoop.mapreduce.InputFormat inputFormat =
                            (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                                    .newInstance(context.getInputFormatClass(), conf);
                    TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(jobConf, null);
                    hadoopRecordReader = (RecordReader) inputFormat.createRecordReader(
                            (org.apache.hadoop.mapreduce.InputSplit) inputSplit, taskAttemptContext);
                } else {
                    Class inputFormatClass = conf.getInputFormat().getClass();
                    InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
                    hadoopRecordReader = (RecordReader) inputFormat.getRecordReader(
                            (org.apache.hadoop.mapred.InputSplit) inputSplit, conf, createReporter());
                }

                Class inputKeyClass;
                Class inputValueClass;
                if (hadoopRecordReader instanceof SequenceFileRecordReader) {
                    inputKeyClass = ((SequenceFileRecordReader) hadoopRecordReader).getKeyClass();
                    inputValueClass = ((SequenceFileRecordReader) hadoopRecordReader).getValueClass();
                } else {
                    inputKeyClass = hadoopRecordReader.createKey().getClass();
                    inputValueClass = hadoopRecordReader.createValue().getClass();
                }

                key = hadoopRecordReader.createKey();
                value = hadoopRecordReader.createValue();
                ByteBuffer outBuffer = ctx.allocateFrame();
                FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
                appender.reset(outBuffer, true);
                RecordDescriptor outputRecordDescriptor = DatatypeHelper.createKeyValueRecordDescriptor(
                        (Class<? extends Writable>) hadoopRecordReader.createKey().getClass(),
                        (Class<? extends Writable>) hadoopRecordReader.createValue().getClass());
                int nFields = outputRecordDescriptor.getFieldCount();
                ArrayTupleBuilder tb = new ArrayTupleBuilder(nFields);

                writer.open();
                try {
                    while (hadoopRecordReader.next(key, value)) {
                        tb.reset();
                        switch (nFields) {
                        case 2:
                            tb.addField(outputRecordDescriptor.getFields()[0], key);
                        case 1:
                            tb.addField(outputRecordDescriptor.getFields()[1], value);
                        }
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(outBuffer, writer);
                            appender.reset(outBuffer, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new HyracksDataException("Record size (" + tb.getSize()
                                        + ") larger than frame size (" + outBuffer.capacity() + ")");
                            }
                        }
                    }
                    if (appender.getTupleCount() > 0) {
                        FrameUtils.flushFrame(outBuffer, writer);
                    }
                } catch (Exception e) {
                    writer.fail();
                    throw new HyracksDataException(e);
                } finally {
                    writer.close();
                }
                hadoopRecordReader.close();
            } catch (InstantiationException e) {
                throw new HyracksDataException(e);
            } catch (IllegalAccessException e) {
                throw new HyracksDataException(e);
            } catch (ClassNotFoundException e) {
                throw new HyracksDataException(e);
            } catch (InterruptedException e) {
                throw new HyracksDataException(e);
            } catch (IOException e) {
                throw new HyracksDataException(e);
            }
        }
    };
}