List of usage examples for org.apache.hadoop.mapred.JobConf.getInputFormat()
public InputFormat getInputFormat()
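Before the per-project examples, a minimal sketch of the pattern they share may help: configure a JobConf, call getInputFormat() to instantiate the configured mapred InputFormat, then use that format to compute splits and open a RecordReader. The class name GetInputFormatSketch and the input path /tmp/input are illustrative placeholders, not taken from the sources below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetInputFormatSketch {
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static void main(String[] args) throws IOException {
        // Illustrative configuration; the input path is a placeholder.
        JobConf conf = new JobConf();
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path("/tmp/input"));

        // getInputFormat() instantiates whatever InputFormat the JobConf is configured with.
        InputFormat inputFormat = conf.getInputFormat();

        // Typical follow-up calls, as in the examples below: compute splits, then read each one.
        InputSplit[] splits = inputFormat.getSplits(conf, 1);
        for (InputSplit split : splits) {
            RecordReader reader = inputFormat.getRecordReader(split, conf, Reporter.NULL);
            Object key = reader.createKey();
            Object value = reader.createValue();
            while (reader.next(key, value)) {
                // process key/value here
            }
            reader.close();
        }
    }
}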
From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java
License:Apache License
private InputSplit[] getOldInputSplits(JobConf conf) throws IOException {
    InputFormat inputFormat = conf.getInputFormat();
    return inputFormat.getSplits(conf, conf.getNumMapTasks());
}
From source file:edu.uci.ics.hyracks.hdfs.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final InputSplit[] inputSplits = splitsFactory.getSplits();
    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                JobConf conf = confFactory.getConf();
                conf.setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                parser.open(writer);
                InputFormat inputFormat = conf.getInputFormat();
                for (int i = 0; i < inputSplits.length; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read
                         * synchronize among simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        RecordReader reader = inputFormat.getRecordReader(inputSplits[i], conf, Reporter.NULL);
                        Object key = reader.createKey();
                        Object value = reader.createValue();
                        while (reader.next(key, value) == true) {
                            parser.parse(key, value, writer, inputSplits[i].toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:edu.uci.ics.hyracks.imru.dataflow.Hdtest.java
License:Apache License
public static JobSpecification createJob() throws Exception {
    JobSpecification spec = new JobSpecification();
    spec.setFrameSize(4096);
    String PATH_TO_HADOOP_CONF = "/home/wangrui/a/imru/hadoop-0.20.2/conf";
    String HDFS_INPUT_PATH = "/customer/customer.tbl,/customer_result/part-0";
    JobConf conf = new JobConf();
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/core-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/mapred-site.xml"));
    conf.addResource(new Path(PATH_TO_HADOOP_CONF + "/hdfs-site.xml"));
    FileInputFormat.setInputPaths(conf, HDFS_INPUT_PATH);
    conf.setInputFormat(TextInputFormat.class);
    RecordDescriptor recordDesc = new RecordDescriptor(
            new ISerializerDeserializer[] { new UTF8StringSerializerDeserializer() });
    InputSplit[] splits = conf.getInputFormat().getSplits(conf, 1);
    HDFSReadOperatorDescriptor readOperator = new HDFSReadOperatorDescriptor(spec, recordDesc, conf, splits,
            new String[] { "NC0", "NC1" }, new IKeyValueParserFactory<LongWritable, Text>() {
                @Override
                public IKeyValueParser<LongWritable, Text> createKeyValueParser(final IHyracksTaskContext ctx) {
                    return new IKeyValueParser<LongWritable, Text>() {
                        TupleWriter tupleWriter;

                        @Override
                        public void open(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter = new TupleWriter(ctx, writer, 1);
                        }

                        @Override
                        public void parse(LongWritable key, Text value, IFrameWriter writer, String fileString)
                                throws HyracksDataException {
                            try {
                                tupleWriter.write(value.getBytes(), 0, value.getLength());
                                tupleWriter.finishField();
                                tupleWriter.finishTuple();
                            } catch (IOException e) {
                                throw new HyracksDataException(e);
                            }
                        }

                        @Override
                        public void close(IFrameWriter writer) throws HyracksDataException {
                            tupleWriter.close();
                        }
                    };
                }
            });
    // createPartitionConstraint(spec, readOperator, new String[] {"NC0"});
    PartitionConstraintHelper.addAbsoluteLocationConstraint(spec, readOperator, new String[] { "NC0", "NC1" });
    IOperatorDescriptor writer = new HDFSOD(spec, null, null, null);
    // createPartitionConstraint(spec, writer, outSplits);
    spec.connect(new OneToOneConnectorDescriptor(spec), readOperator, 0, writer, 0);
    spec.addRoot(writer);
    return spec;
}
From source file:edu.uci.ics.hyracks.imru.jobgen.IMRUJobFactory.java
License:Apache License
public InputSplit[] getInputSplits() throws IOException {
    JobConf conf = getConf();
    FileInputFormat.setInputPaths(conf, inputPaths);
    conf.setInputFormat(HDFSBlockFormat.class);
    return conf.getInputFormat().getSplits(conf, 1);
}
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
From source file:org.apache.asterix.external.input.HDFSDataSourceFactory.java
License:Apache License
@Override
public void configure(Map<String, String> configuration) throws AsterixException {
    try {
        init();
        this.configuration = configuration;
        JobConf conf = HDFSUtils.configureHDFSJobConf(configuration);
        confFactory = new ConfFactory(conf);
        clusterLocations = getPartitionConstraint();
        int numPartitions = clusterLocations.getLocations().length;
        // if files list was set, we restrict the splits to the list
        InputSplit[] inputSplits;
        if (files == null) {
            inputSplits = conf.getInputFormat().getSplits(conf, numPartitions);
        } else {
            inputSplits = HDFSUtils.getSplits(conf, files);
        }
        if (indexingOp) {
            readSchedule = indexingScheduler.getLocationConstraints(inputSplits);
        } else {
            readSchedule = hdfsScheduler.getLocationConstraints(inputSplits);
        }
        inputSplitsFactory = new InputSplitsFactory(inputSplits);
        read = new boolean[readSchedule.length];
        Arrays.fill(read, false);
        String formatString = configuration.get(ExternalDataConstants.KEY_FORMAT);
        if (formatString == null || formatString.equals(ExternalDataConstants.FORMAT_HDFS_WRITABLE)) {
            RecordReader<?, ?> reader = conf.getInputFormat().getRecordReader(inputSplits[0], conf,
                    Reporter.NULL);
            this.recordClass = reader.createValue().getClass();
            reader.close();
        } else {
            format = StreamRecordReaderProvider.getReaderFormat(configuration);
            this.recordClass = char[].class;
        }
    } catch (IOException e) {
        throw new AsterixException(e);
    }
}
From source file:org.apache.asterix.external.input.record.reader.hdfs.HDFSRecordReader.java
License:Apache License
public HDFSRecordReader(boolean read[], InputSplit[] inputSplits, String[] readSchedule, String nodeName,
        JobConf conf, List<ExternalFile> snapshot, IExternalIndexer indexer) throws IOException {
    this.read = read;
    this.inputSplits = inputSplits;
    this.readSchedule = readSchedule;
    this.nodeName = nodeName;
    this.conf = conf;
    this.inputFormat = conf.getInputFormat();
    this.reader = new EmptyRecordReader<K, Writable>();
    this.record = new GenericRecord<Writable>();
    this.indexer = indexer;
    this.snapshot = snapshot;
    this.hdfs = FileSystem.get(conf);
    nextInputSplit();
}
From source file:org.apache.asterix.external.input.stream.HDFSInputStream.java
License:Apache License
@SuppressWarnings("unchecked") public HDFSInputStream(boolean read[], InputSplit[] inputSplits, String[] readSchedule, String nodeName, JobConf conf, Map<String, String> configuration, List<ExternalFile> snapshot, IExternalIndexer indexer) throws IOException, AsterixException { this.read = read; this.inputSplits = inputSplits; this.readSchedule = readSchedule; this.nodeName = nodeName; this.conf = conf; this.inputFormat = conf.getInputFormat(); this.reader = new EmptyRecordReader<Object, Text>(); this.snapshot = snapshot; this.hdfs = FileSystem.get(conf); this.indexer = indexer; nextInputSplit();//from www .j av a 2 s .c o m this.value = new Text(); if (snapshot != null) { if (currentSplitIndex < snapshot.size()) { indexer.reset(this); } } }
From source file:org.apache.asterix.test.runtime.HDFSCluster.java
License:Apache License
public static void main(String[] args) throws Exception {
    HDFSCluster cluster = new HDFSCluster();
    cluster.setup();
    JobConf conf = configureJobConf();
    InputSplit[] inputSplits = conf.getInputFormat().getSplits(conf, 0);
    for (InputSplit split : inputSplits) {
        System.out.println("split :" + split);
    }
}
From source file:org.apache.drill.exec.store.hive.HiveAbstractReader.java
License:Apache License
private void init() throws ExecutionSetupException {
    final JobConf job = new JobConf(hiveConf);

    // Get the configured default val
    defaultPartitionValue = hiveConf.get(ConfVars.DEFAULTPARTITIONNAME.varname);

    Properties tableProperties;
    try {
        tableProperties = HiveUtilities.getTableMetadata(table);
        final Properties partitionProperties = (partition == null) ? tableProperties
                : HiveUtilities.getPartitionMetadata(partition, table);
        HiveUtilities.addConfToJob(job, partitionProperties);

        final SerDe tableSerDe = createSerDe(job, table.getSd().getSerdeInfo().getSerializationLib(),
                tableProperties);
        final StructObjectInspector tableOI = getStructOI(tableSerDe);

        if (partition != null) {
            partitionSerDe = createSerDe(job, partition.getSd().getSerdeInfo().getSerializationLib(),
                    partitionProperties);
            partitionOI = getStructOI(partitionSerDe);

            finalOI = (StructObjectInspector) ObjectInspectorConverters.getConvertedOI(partitionOI, tableOI);
            partTblObjectInspectorConverter = ObjectInspectorConverters.getConverter(partitionOI, finalOI);
            job.setInputFormat(HiveUtilities.getInputFormatClass(job, partition.getSd(), table));
        } else {
            // For non-partitioned tables, there is no need to create converter as there are no schema changes expected.
            partitionSerDe = tableSerDe;
            partitionOI = tableOI;
            partTblObjectInspectorConverter = null;
            finalOI = tableOI;
            job.setInputFormat(HiveUtilities.getInputFormatClass(job, table.getSd(), table));
        }

        if (logger.isTraceEnabled()) {
            for (StructField field : finalOI.getAllStructFieldRefs()) {
                logger.trace("field in finalOI: {}", field.getClass().getName());
            }
            logger.trace("partitionSerDe class is {} {}", partitionSerDe.getClass().getName());
        }

        // Get list of partition column names
        final List<String> partitionNames = Lists.newArrayList();
        for (FieldSchema field : table.getPartitionKeys()) {
            partitionNames.add(field.getName());
        }

        // We should always get the columns names from ObjectInspector. For some of the tables (ex. avro) metastore
        // may not contain the schema, instead it is derived from other sources such as table properties or external file.
        // SerDe object knows how to get the schema with all the config and table properties passed in initialization.
        // ObjectInspector created from the SerDe object has the schema.
        final StructTypeInfo sTypeInfo = (StructTypeInfo) TypeInfoUtils.getTypeInfoFromObjectInspector(finalOI);
        final List<String> tableColumnNames = sTypeInfo.getAllStructFieldNames();

        // Select list of columns for project pushdown into Hive SerDe readers.
        final List<Integer> columnIds = Lists.newArrayList();
        if (isStarQuery()) {
            selectedColumnNames = tableColumnNames;
            for (int i = 0; i < selectedColumnNames.size(); i++) {
                columnIds.add(i);
            }
            selectedPartitionNames = partitionNames;
        } else {
            selectedColumnNames = Lists.newArrayList();
            for (SchemaPath field : getColumns()) {
                String columnName = field.getRootSegment().getPath();
                if (partitionNames.contains(columnName)) {
                    selectedPartitionNames.add(columnName);
                } else {
                    columnIds.add(tableColumnNames.indexOf(columnName));
                    selectedColumnNames.add(columnName);
                }
            }
        }
        ColumnProjectionUtils.appendReadColumns(job, columnIds, selectedColumnNames);

        for (String columnName : selectedColumnNames) {
            StructField fieldRef = finalOI.getStructFieldRef(columnName);
            selectedStructFieldRefs.add(fieldRef);
            ObjectInspector fieldOI = fieldRef.getFieldObjectInspector();
            TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(fieldOI.getTypeName());
            selectedColumnObjInspectors.add(fieldOI);
            selectedColumnTypes.add(typeInfo);
            selectedColumnFieldConverters.add(HiveFieldConverter.create(typeInfo, fragmentContext));
        }

        for (int i = 0; i < selectedColumnNames.size(); ++i) {
            logger.trace("inspector:typeName={}, className={}, TypeInfo: {}, converter:{}",
                    selectedColumnObjInspectors.get(i).getTypeName(),
                    selectedColumnObjInspectors.get(i).getClass().getName(),
                    selectedColumnTypes.get(i).toString(),
                    selectedColumnFieldConverters.get(i).getClass().getName());
        }

        for (int i = 0; i < table.getPartitionKeys().size(); i++) {
            FieldSchema field = table.getPartitionKeys().get(i);
            if (selectedPartitionNames.contains(field.getName())) {
                TypeInfo pType = TypeInfoUtils.getTypeInfoFromTypeString(field.getType());
                selectedPartitionTypes.add(pType);
                if (partition != null) {
                    selectedPartitionValues.add(HiveUtilities.convertPartitionType(pType,
                            partition.getValues().get(i), defaultPartitionValue));
                }
            }
        }
    } catch (Exception e) {
        throw new ExecutionSetupException("Failure while initializing Hive Reader " + this.getClass().getName(),
                e);
    }

    if (!empty) {
        try {
            reader = (org.apache.hadoop.mapred.RecordReader<Object, Object>) job.getInputFormat()
                    .getRecordReader(inputSplit, job, Reporter.NULL);
            logger.trace("hive reader created: {} for inputSplit {}", reader.getClass().getName(),
                    inputSplit.toString());
        } catch (Exception e) {
            throw new ExecutionSetupException(
                    "Failed to get o.a.hadoop.mapred.RecordReader from Hive InputFormat", e);
        }
        internalInit(tableProperties, reader);
    }
}