Example usage for org.apache.hadoop.mapred JobConf getInputFormat

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#getInputFormat drawn from open-source projects.

Prototype

public InputFormat getInputFormat() 

Document

Gets the InputFormat implementation for the map-reduce job, defaulting to TextInputFormat if not specified explicitly.
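
For orientation, here is a minimal, self-contained sketch of the call (not taken from any of the projects below). It assumes the old mapred API and a hypothetical input path /tmp/input, which must exist for the split computation to succeed.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

public class GetInputFormatExample {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf();
        // No setInputFormat() call, so getInputFormat() falls back to TextInputFormat.
        FileInputFormat.setInputPaths(conf, new Path("/tmp/input")); // hypothetical input path
        InputFormat inputFormat = conf.getInputFormat();
        System.out.println(inputFormat.getClass().getName()); // org.apache.hadoop.mapred.TextInputFormat

        // The returned InputFormat is typically used to compute splits; the second
        // argument is only a hint for the desired number of splits.
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        System.out.println("number of splits: " + splits.length);
    }
}

Because no input format is set on the JobConf, getInputFormat() returns a TextInputFormat instance; the examples below mostly pass the result straight to getSplits() or getRecordReader().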

Usage

From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java

License:Apache License

private InputSplit[] getOldInputSplits(JobConf conf) throws IOException {
    InputFormat inputFormat = conf.getInputFormat();
    return inputFormat.getSplits(conf, conf.getNumMapTasks());
}

From source file:edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                parser.open(writer);
                InputFormat inputFormat = conf.getInputFormat();
                for (int i = 0; i < inputSplits.length; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read
                         * synchronize among simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (!executed[i]) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }

                        /**
                         * read the split
                         */
                        RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                        Object key = reader.createKey();
                        Object value = reader.createValue();
                        while (reader.next(key, value)) {
                            parser.parse(key, value, writer, inputSplits[i].toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}

From source file:edu.uci.ics.hyracks.imru.dataflow.Hdtest.java

License:Apache License

public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);

    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }

            });

    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });

    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);

    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);

    spec.addRoot(writer);
    return spec;
}

From source file:edu.uci.ics.hyracks.imru.jobgen.IMRUJobFactory.java

License:Apache License

public InputSplit[] getInputSplits() throws IOException {
    JobConf conf = getConf();
    FileInputFormat.setInputPaths(conf, inputPaths);
    conf.setInputFormat(HDFSBlockFormat.class);
    return conf.getInputFormat().getSplits(conf, 1);
}

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task 
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}

From source file:org.apache.asterix.external.input.HDFSDataSourceFactory.java

License:Apache License

@Override
public void configure(Map<String, String> configuration) throws AsterixException {
    try {
        init();
        this.configuration = configuration;
        JobConf conf = HDFSUtils.configureHDFSJobConf(configuration);
        confFactory = new ConfFactory(conf);
        clusterLocations = getPartitionConstraint();
        int numPartitions = clusterLocations.getLocations().length;
        // if files list was set, we restrict the splits to the list
        InputSplit[] inputSplits;
        if (files == null) {
            inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
        } else {
            inputSplits = HDFSUtils.getSplits(conf, files);
        }
        if (indexingOp) {
            readSchedule = indexingScheduler.getLocationConstraints(inputSplits);
        } else {
            readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
        }
        inputSplitsFactory = new InputSplitsFactory(inputSplits);
        read = new boolean[readSchedule.length];
        Arrays.fill(read, false);
        String formatString = configuration.get(ExternalDataConstants.KEY_FORMAT);
        if (formatString == null || formatString.equals(ExternalDataConstants.FORMAT_HDFS_WRITABLE)) {
            RecordReader<?, ?> reader = conf.getInputFormat().getRecordReader(inputSplits[0], conf,
                    Reporter.NULL);
            this.recordClass = reader.createValue().getClass();
            reader.close();
        } else {
            format = StreamRecordReaderProvider.getReaderFormat(configuration);
            this.recordClass = char[].class;
        }
    } catch (IOException e) {
        throw new AsterixException(e);
    }
}

From source file:org.apache.asterix.external.input.record.reader.hdfs.HDFSRecordReader.java

License:Apache License

public HDFSRecordReader(boolean read[], InputSplit[] inputSplits, String[] readSchedule, String nodeName,
        JobConf conf, List<ExternalFile> snapshot, IExternalIndexer indexer) throws IOException {
    this.read = read;
    this.inputSplits = inputSplits;
    this.readSchedule = readSchedule;
    this.nodeName = nodeName;
    this.conf = conf;
    this.inputFormat = conf.getInputFormat();
    this.reader = new EmptyRecordReader<K, Writable>();
    this.record = new GenericRecord<Writable>();
    this.indexer = indexer;
    this.snapshot = snapshot;
    this.hdfs = FileSystem.get(conf);
    nextInputSplit();
}

From source file:org.apache.asterix.external.input.stream.HDFSInputStream.java

License:Apache License

@SuppressWarnings("unchecked")
public HDFSInputStream(boolean read[], InputSplit[] inputSplits, String[] readSchedule, String nodeName,
        JobConf conf, Map<String, String> configuration, List<ExternalFile> snapshot, IExternalIndexer indexer)
        throws IOException, AsterixException {
    this.read = read;
    this.inputSplits = inputSplits;
    this.readSchedule = readSchedule;
    this.nodeName = nodeName;
    this.conf = conf;
    this.inputFormat = conf.getInputFormat();
    this.reader = new EmptyRecordReader<Object, Text>();
    this.snapshot = snapshot;
    this.hdfs = FileSystem.get(conf);
    this.indexer = indexer;
    nextInputSplit();
    this.value = new Text();
    if (snapshot != null) {
        if (currentSplitIndex < snapshot.size()) {
            indexer.reset(this);
        }
    }
}

From source file:org.apache.asterix.test.runtime.HDFSCluster.java

License:Apache License

public static void main(String[] args) throws Exception {
    HDFSCluster cluster = new HDFSCluster();
    cluster.setup();
    JobConf conf = configureJobConf();
    InputSplit[] inputSplits = conf.getInputFormat().getSplits(conf, 0);
    for (InputSplit split : inputSplits) {
        System.out.println("split :" + split);
    }
}

From source file:org.apache.drill.exec.store.hive.HiveAbstractReader.java

License:Apache License

private void init() throws ExecutionSetupException {
    final JobConf job = new JobConf(hiveConf);

    // Get the configured default val
    defaultPartitionValue = hiveConf.get(ConfVars.DEFAULTPARTITIONNAME.varname);

    Properties tableProperties;
    try {
        tableProperties = HiveUtilities.getTableMetadata(table);
        final Properties partitionProperties = (partition == null) ? tableProperties
                : HiveUtilities.getPartitionMetadata(partition, table);
        HiveUtilities.addConfToJob(job, partitionProperties);

        final SerDe tableSerDe = createSerDe(job, table.getSd().getSerdeInfo().getSerializationLib(),
                tableProperties);
        final StructObjectInspector tableOI = getStructOI(tableSerDe);

        if (partition != null) {
            partitionSerDe = createSerDe(job, partition.getSd().getSerdeInfo().getSerializationLib(),
                    partitionProperties);
            partitionOI = getStructOI(partitionSerDe);

            finalOI = (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(partitionOI, tableOI);
            partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(partitionOI, finalOI);
            job.setInputFormat(HiveUtilities.getInputFormatClass(job, partition.getSd(), table));
        } else {
            // For non-partitioned tables, there is no need to create converter as there are no schema changes expected.
            partitionSerDe = tableSerDe;
            partitionOI = tableOI;
            partTblObjectInspectorConverter = null;
            finalOI = tableOI;
            job.setInputFormat(HiveUtilities.getInputFormatClass(job, table.getSd(), table));
        }

        if (logger.isTraceEnabled()) {
            for (StructField field : finalOI.getAllStructFieldRefs()) {
                logger.trace("field in finalOI: {}", field.getClass().getName());
            }
            logger.trace("partitionSerDe class is {} {}", partitionSerDe.getClass().getName());
        }
        // Get list of partition column names
        final List<String> partitionNames = Lists.newArrayList();
        for (FieldSchema field : table.getPartitionKeys()) {
            partitionNames.add(field.getName());
        }

        // We should always get the columns names from ObjectInspector. For some of the tables (ex. avro) metastore
        // may not contain the schema, instead it is derived from other sources such as table properties or external file.
        // SerDe object knows how to get the schema with all the config and table properties passed in initialization.
        // ObjectInspector created from the SerDe object has the schema.
        final StructTypeInfo sTypeInfo = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(finalOI);
        final List<String> tableColumnNames = sTypeInfo.getAllStructFieldNames();

        // Select list of columns for project pushdown into Hive SerDe readers.
        final List<Integer> columnIds = Lists.newArrayList();
        if (isStarQuery()) {
            selectedColumnNames = tableColumnNames;
            for (int i = 0; i < selectedColumnNames.size(); i++) {
                columnIds.add(i);
            }
            selectedPartitionNames = partitionNames;
        } else {
            selectedColumnNames = Lists.newArrayList();
            for (SchemaPath field : getColumns()) {
                String columnName = field.getRootSegment().getPath();
                if (partitionNames.contains(columnName)) {
                    selectedPartitionNames.add(columnName);
                } else {
                    columnIds.add(tableColumnNames.indexOf(columnName));
                    selectedColumnNames.add(columnName);
                }
            }
        }
        ColumnProjectionUtils.appendReadColumns(job, columnIds, selectedColumnNames);

        for (String columnName : selectedColumnNames) {
            StructField fieldRef = finalOI.getStructFieldRef(columnName);
            selectedStructFieldRefs.add(fieldRef);
            ObjectInspector fieldOI = fieldRef.getFieldObjectInspector();

            TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(fieldOI.getTypeName());

            selectedColumnObjInspectors.add(fieldOI);
            selectedColumnTypes.add(typeInfo);
            selectedColumnFieldConverters.add(HiveFieldConverter.create(typeInfo, fragmentContext));
        }

        for (int i = 0; i < selectedColumnNames.size(); ++i) {
            logger.trace("inspector:typeName={}, className={}, TypeInfo: {}, converter:{}",
                    selectedColumnObjInspectors.get(i).getTypeName(),
                    selectedColumnObjInspectors.get(i).getClass().getName(),
                    selectedColumnTypes.get(i).toString(),
                    selectedColumnFieldConverters.get(i).getClass().getName());
        }

        for (int i = 0; i < table.getPartitionKeys().size(); i++) {
            FieldSchema field = table.getPartitionKeys().get(i);
            if (selectedPartitionNames.contains(field.getName())) {
                TypeInfo pType = TypeInfoUtils.getTypeInfoFromTypeString(field.getType());
                selectedPartitionTypes.add(pType);

                if (partition != null) {
                    selectedPartitionValues.add(HiveUtilities.convertPartitionType(pType,
                            partition.getValues().get(i), defaultPartitionValue));
                }
            }
        }
    } catch (Exception e) {
        throw new ExecutionSetupException("Failure while initializing Hive Reader " + this.getClass().getName(),
                e);
    }

    if (!empty) {
        try {
            reader = (org.apache.hadoop.mapred.RecordReader<Object, Object>) job.getInputFormat()
                    .getRecordReader(inputSplit, job, Reporter.NULL);
            logger.trace("hive reader created: {} for inputSplit {}", reader.getClass().getName(),
                    inputSplit.toString());
        } catch (Exception e) {
            throw new ExecutionSetupException(
                    "Failed to get o.a.hadoop.mapred.RecordReader from Hive InputFormat", e);
        }

        internalInit(tableProperties, reader);
    }
}