List of usage examples for org.apache.hadoop.mapred.JobConf.getInputFormat()
public InputFormat getInputFormat()
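In the old org.apache.hadoop.mapred API, getInputFormat() instantiates the InputFormat class configured on the JobConf (TextInputFormat by default); the returned object is then used to compute splits and open record readers, which is the pattern every example below follows. Here is a minimal, self-contained sketch of that call pattern. It is not taken from any of the listed projects, and the input path and split count are placeholders.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetInputFormatExample {
    @SuppressWarnings("unchecked")
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // Placeholder input path; point this at real data.
        FileInputFormat.addInputPath(conf, new Path("/tmp/input"));
        conf.setInputFormat(TextInputFormat.class); // also the default

        // getInputFormat() instantiates the InputFormat configured above.
        InputFormat<LongWritable, Text> inputFormat = conf.getInputFormat();

        // Compute splits, then read every record of every split.
        InputSplit[] splits = inputFormat.getSplits(conf, conf.getNumMapTasks());
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                    inputFormat.getRecordReader(split, conf, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                System.out.println(key.get() + "\t" + value);
            }
            reader.close();
        }
    }
}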
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("See HADOOP-5588")
public void directoryWithSubdirectoryUsingGlob() throws Exception {
    JobConf conf = new JobConf();
    Path path = new Path(BASE_PATH, "dir/a*");
    FileInputFormat.addInputPath(conf, path);
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    assertThat(splits.length, is(1));
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("Needs more investigation")
public void recordsCoincideWithBlocks() throws IOException {
    int recordLength = 1024;
    Path input = new Path("input");
    // createFile, checkSplit and checkRecordReader are helpers defined elsewhere in this test class.
    createFile(input, 12, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    assertThat(splits.length, is(3));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 4, 8);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 8, 12);
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
public void recordsDontCoincideWithBlocks() throws IOException {
    int recordLength = 1024 + 512;
    Path input = new Path("input");
    createFile(input, 8, recordLength);

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    System.out.println(Arrays.asList(splits));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);
    checkSplit(splits[2], 8192, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 3);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 3, 6);
    checkRecordReader(inputFormat, splits[2], job, recordLength, 6, 8);
}
From source file: crunch.MaxTemperature.java
License: Apache License

@Test
@Ignore("Needs more investigation")
public void compression() throws IOException {
    int recordLength = 1024;
    Path input = new Path("input.bz2");
    createFile(input, 24, recordLength);
    System.out.println(">>>>>>" + fs.getLength(input));

    JobConf job = new JobConf();
    job.set("fs.default.name", fs.getUri().toString());
    FileInputFormat.addInputPath(job, input);

    InputFormat<LongWritable, Text> inputFormat = job.getInputFormat();
    InputSplit[] splits = inputFormat.getSplits(job, job.getNumMapTasks());
    System.out.println(Arrays.asList(splits));
    assertThat(splits.length, is(2));

    checkSplit(splits[0], 0, 4096);
    checkSplit(splits[1], 4096, 4096);

    checkRecordReader(inputFormat, splits[0], job, recordLength, 0, 4);
    checkRecordReader(inputFormat, splits[1], job, recordLength, 5, 12);
}
From source file: edu.uci.ics.asterix.external.adapter.factory.HDFSAdapterFactory.java
License: Apache License

@Override
public void configure(Map<String, String> configuration, ARecordType outputType) throws Exception {
    if (!initialized) {
        hdfsScheduler = initializeHDFSScheduler();
        initialized = true;
    }
    this.configuration = configuration;
    JobConf conf = configureJobConf(configuration);
    confFactory = new ConfFactory(conf);
    clusterLocations = getClusterLocations();
    int numPartitions = ((AlgebricksAbsolutePartitionConstraint) clusterLocations).getLocations().length;
    // if files list was set, we restrict the splits to the list since this dataset is indexed
    InputSplit[] inputSplits;
    if (files == null) {
        inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
    } else {
        inputSplits = getSplits(conf);
    }
    inputSplitsFactory = new InputSplitsFactory(inputSplits);
    readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
    executed = new boolean[readSchedule.length];
    Arrays.fill(executed, false);
    configured = true;
    atype = (IAType) outputType;
    configureFormat(atype);
}
From source file: edu.uci.ics.asterix.test.runtime.HDFSCluster.java
License: Apache License

public static void main(String[] args) throws Exception {
    HDFSCluster cluster = new HDFSCluster();
    cluster.setup();
    JobConf conf = configureJobConf();
    FileSystem fs = FileSystem.get(conf);
    InputSplit[] inputSplits = conf.getInputFormat().getSplits(conf, 0);
    for (InputSplit split : inputSplits) {
        System.out.println("split :" + split);
    }
    // cluster.cleanup();
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopMapperOperatorDescriptor.java
License: Apache License

private Object getRecordReader(JobConf conf, Object inputSplit)
        throws ClassNotFoundException, IOException, InterruptedException {
    if (conf.getUseNewMapper()) {
        JobContext context = new ContextFactory().createJobContext(conf);
        org.apache.hadoop.mapreduce.InputFormat inputFormat =
                (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                        .newInstance(context.getInputFormatClass(), conf);
        TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(conf, new TaskAttemptID());
        return inputFormat.createRecordReader((org.apache.hadoop.mapreduce.InputSplit) inputSplit,
                taskAttemptContext);
    } else {
        Class inputFormatClass = conf.getInputFormat().getClass();
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
        return inputFormat.getRecordReader((org.apache.hadoop.mapred.InputSplit) inputSplit, conf,
                super.createReporter());
    }
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

public HadoopReadOperatorDescriptor(JobConf jobConf, JobSpecification spec, Object[] splits) throws IOException {
    super(spec, 0, 1);
    this.jobConfMap = DatatypeHelper.jobConf2Map(jobConf);
    InputFormat inputFormat = jobConf.getInputFormat();
    RecordReader recordReader;
    try {
        recordReader = getRecordReader(DatatypeHelper.map2JobConf(jobConfMap), splits[0]);
    } catch (Exception e) {
        throw new IOException(e);
    }
    recordDescriptors[0] = DatatypeHelper.createKeyValueRecordDescriptor(
            (Class<? extends Writable>) recordReader.createKey().getClass(),
            (Class<? extends Writable>) recordReader.createValue().getClass());
    PartitionConstraintHelper.addPartitionCountConstraint(spec, this, splits.length);
    inputSplitsProxy = new InputSplitsProxy(jobConf, splits);
    this.inputFormatClassName = inputFormat.getClass().getName();
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

private RecordReader getRecordReader(JobConf conf, Object inputSplit)
        throws ClassNotFoundException, IOException, InterruptedException {
    RecordReader hadoopRecordReader = null;
    if (conf.getUseNewMapper()) {
        JobContext context = new ContextFactory().createJobContext(conf);
        org.apache.hadoop.mapreduce.InputFormat inputFormat =
                (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                        .newInstance(context.getInputFormatClass(), conf);
        TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(jobConf, null);
        hadoopRecordReader = (RecordReader) inputFormat
                .createRecordReader((org.apache.hadoop.mapreduce.InputSplit) inputSplit, taskAttemptContext);
    } else {
        Class inputFormatClass = conf.getInputFormat().getClass();
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
        hadoopRecordReader = (RecordReader) inputFormat
                .getRecordReader((org.apache.hadoop.mapred.InputSplit) inputSplit, conf, createReporter());
    }
    return hadoopRecordReader;
}
From source file: edu.uci.ics.hyracks.dataflow.hadoop.HadoopReadOperatorDescriptor.java
License: Apache License

@SuppressWarnings("deprecation")
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        final IRecordDescriptorProvider recordDescProvider, final int partition, int nPartitions)
        throws HyracksDataException {
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @Override
        public void initialize() throws HyracksDataException {
            try {
                JobConf conf = DatatypeHelper.map2JobConf((HashMap) jobConfMap);
                Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());
                conf.setClassLoader(this.getClass().getClassLoader());
                RecordReader hadoopRecordReader;
                Object key;
                Object value;
                Object[] splits = inputSplitsProxy.toInputSplits(conf);
                Object inputSplit = splits[partition];

                if (conf.getUseNewMapper()) {
                    JobContext context = new ContextFactory().createJobContext(conf);
                    org.apache.hadoop.mapreduce.InputFormat inputFormat =
                            (org.apache.hadoop.mapreduce.InputFormat) ReflectionUtils
                                    .newInstance(context.getInputFormatClass(), conf);
                    TaskAttemptContext taskAttemptContext = new ContextFactory().createContext(jobConf, null);
                    hadoopRecordReader = (RecordReader) inputFormat.createRecordReader(
                            (org.apache.hadoop.mapreduce.InputSplit) inputSplit, taskAttemptContext);
                } else {
                    Class inputFormatClass = conf.getInputFormat().getClass();
                    InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, conf);
                    hadoopRecordReader = (RecordReader) inputFormat.getRecordReader(
                            (org.apache.hadoop.mapred.InputSplit) inputSplit, conf, createReporter());
                }

                Class inputKeyClass;
                Class inputValueClass;
                if (hadoopRecordReader instanceof SequenceFileRecordReader) {
                    inputKeyClass = ((SequenceFileRecordReader) hadoopRecordReader).getKeyClass();
                    inputValueClass = ((SequenceFileRecordReader) hadoopRecordReader).getValueClass();
                } else {
                    inputKeyClass = hadoopRecordReader.createKey().getClass();
                    inputValueClass = hadoopRecordReader.createValue().getClass();
                }

                key = hadoopRecordReader.createKey();
                value = hadoopRecordReader.createValue();
                ByteBuffer outBuffer = ctx.allocateFrame();
                FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
                appender.reset(outBuffer, true);
                RecordDescriptor outputRecordDescriptor = DatatypeHelper.createKeyValueRecordDescriptor(
                        (Class<? extends Writable>) hadoopRecordReader.createKey().getClass(),
                        (Class<? extends Writable>) hadoopRecordReader.createValue().getClass());
                int nFields = outputRecordDescriptor.getFieldCount();
                ArrayTupleBuilder tb = new ArrayTupleBuilder(nFields);

                writer.open();
                try {
                    while (hadoopRecordReader.next(key, value)) {
                        tb.reset();
                        switch (nFields) {
                        case 2:
                            tb.addField(outputRecordDescriptor.getFields()[0], key);
                        case 1:
                            tb.addField(outputRecordDescriptor.getFields()[1], value);
                        }
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(outBuffer, writer);
                            appender.reset(outBuffer, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new HyracksDataException("Record size (" + tb.getSize()
                                        + ") larger than frame size (" + outBuffer.capacity() + ")");
                            }
                        }
                    }
                    if (appender.getTupleCount() > 0) {
                        FrameUtils.flushFrame(outBuffer, writer);
                    }
                } catch (Exception e) {
                    writer.fail();
                    throw new HyracksDataException(e);
                } finally {
                    writer.close();
                }
                hadoopRecordReader.close();
            } catch (InstantiationException e) {
                throw new HyracksDataException(e);
            } catch (IllegalAccessException e) {
                throw new HyracksDataException(e);
            } catch (ClassNotFoundException e) {
                throw new HyracksDataException(e);
            } catch (InterruptedException e) {
                throw new HyracksDataException(e);
            } catch (IOException e) {
                throw new HyracksDataException(e);
            }
        }
    };
}