List of usage examples for org.apache.hadoop.mapred JobConf getInt
public int getInt(String name, int defaultValue)
Gets the value of the name property as an int; if no such property exists, defaultValue is returned.
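Before the project-specific examples below, here is a minimal, self-contained sketch of the call itself. The property name my.job.num.reducers and its values are invented purely for illustration and are not real Hadoop configuration keys.

import org.apache.hadoop.mapred.JobConf;

public class GetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // "my.job.num.reducers" is a made-up key; it is not set yet,
        // so getInt falls back to the supplied default (4).
        int before = conf.getInt("my.job.num.reducers", 4);

        // Set the property (stored internally as a String) and read it back as an int.
        conf.setInt("my.job.num.reducers", 16);
        int after = conf.getInt("my.job.num.reducers", 4);

        System.out.println(before + " " + after); // prints: 4 16
    }
}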
From source file:org.apache.flink.batch.connectors.hive.HiveTableOutputFormat.java
License:Apache License
private HivePartitionWriter writerForLocation(String location) throws IOException {
    JobConf clonedConf = new JobConf(jobConf);
    clonedConf.set(OUTDIR, location);
    OutputFormat outputFormat;
    try {
        StorageDescriptor sd = hiveTablePartition.getStorageDescriptor();
        Class outputFormatClz = Class.forName(sd.getOutputFormat(), true,
                Thread.currentThread().getContextClassLoader());
        outputFormatClz = HiveFileFormatUtils.getOutputFormatSubstitute(outputFormatClz);
        outputFormat = (OutputFormat) outputFormatClz.newInstance();
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new FlinkRuntimeException("Unable to instantiate the hadoop output format", e);
    }
    ReflectionUtils.setConf(outputFormat, clonedConf);
    OutputCommitter outputCommitter = clonedConf.getOutputCommitter();
    JobContext jobContext = new JobContextImpl(clonedConf, new JobID());
    outputCommitter.setupJob(jobContext);
    final boolean isCompressed = clonedConf.getBoolean(HiveConf.ConfVars.COMPRESSRESULT.varname, false);
    if (isCompressed) {
        String codecStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATECODEC.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(codecStr)) {
            try {
                Class<? extends CompressionCodec> codec = (Class<? extends CompressionCodec>) Class
                        .forName(codecStr, true, Thread.currentThread().getContextClassLoader());
                FileOutputFormat.setOutputCompressorClass(clonedConf, codec);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
        String typeStr = clonedConf.get(HiveConf.ConfVars.COMPRESSINTERMEDIATETYPE.varname);
        if (!StringUtils.isNullOrWhitespaceOnly(typeStr)) {
            SequenceFile.CompressionType style = SequenceFile.CompressionType.valueOf(typeStr);
            SequenceFileOutputFormat.setOutputCompressionType(clonedConf, style);
        }
    }
    String taskPartition = String.valueOf(clonedConf.getInt("mapreduce.task.partition", -1));
    Path taskPath = FileOutputFormat.getTaskOutputPath(clonedConf, taskPartition);
    FileSinkOperator.RecordWriter recordWriter;
    try {
        recordWriter = HiveFileFormatUtils.getRecordWriter(clonedConf, outputFormat, outputClass,
                isCompressed, tblProperties, taskPath, Reporter.NULL);
    } catch (HiveException e) {
        throw new IOException(e);
    }
    return new HivePartitionWriter(clonedConf, outputFormat, recordWriter, outputCommitter);
}
From source file:org.apache.hawq.pxf.plugins.json.JsonRecordReader.java
License:Apache License
/**
 * Create new multi-line json object reader.
 *
 * @param conf
 *            Hadoop context
 * @param split
 *            HDFS split to start the reading from
 * @throws IOException IOException when reading the file
 */
public JsonRecordReader(JobConf conf, FileSplit split) throws IOException {
    this.jsonMemberName = conf.get(RECORD_MEMBER_IDENTIFIER);
    this.maxObjectLength = conf.getInt(RECORD_MAX_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        start = 0;
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    parser = new PartitionedJsonParser(is);
    this.pos = start;
}
From source file:org.apache.mahout.cf.taste.hadoop.similarity.item.SimilarityReducer.java
License:Apache License
@Override
public void configure(JobConf jobConf) {
    super.configure(jobConf);
    distributedItemSimilarity = ItemSimilarityJob
            .instantiateSimilarity(jobConf.get(ItemSimilarityJob.DISTRIBUTED_SIMILARITY_CLASSNAME));
    numberOfUsers = jobConf.getInt(ItemSimilarityJob.NUMBER_OF_USERS, -1);
    if (numberOfUsers <= 0) {
        throw new IllegalStateException("Number of users was not set correctly");
    }
}
From source file:org.apache.mahout.df.mapred.partial.Step1Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    configure(Builder.getRandomSeed(job), job.getInt("mapred.task.partition", -1),
            job.getNumMapTasks(), Builder.getNbTrees(job));
}
From source file:org.apache.mahout.df.mapred.partial.Step2Mapper.java
License:Apache License
@Override
public void configure(JobConf job) {
    // get the cached files' paths
    URI[] files;
    try {
        files = DistributedCache.getCacheFiles(job);
    } catch (IOException e) {
        throw new IllegalStateException("Exception while getting the cache files : ", e);
    }

    if ((files == null) || (files.length < 2)) {
        throw new IllegalArgumentException("missing paths from the DistributedCache");
    }

    Dataset dataset;
    try {
        Path datasetPath = new Path(files[0].getPath());
        dataset = Dataset.load(job, datasetPath);
    } catch (IOException e) {
        throw new IllegalStateException("Exception while loading the dataset : ", e);
    }

    int numMaps = job.getNumMapTasks();
    int p = job.getInt("mapred.task.partition", -1);

    // total number of trees in the forest
    int numTrees = Builder.getNbTrees(job);
    if (numTrees == -1) {
        throw new IllegalArgumentException("numTrees not found !");
    }

    int nbConcerned = nbConcerned(numMaps, numTrees, p);
    keys = new TreeID[nbConcerned];
    trees = new Node[nbConcerned];

    int numInstances;
    try {
        Path forestPath = new Path(files[1].getPath());
        FileSystem fs = forestPath.getFileSystem(job);
        numInstances = InterResults.load(fs, forestPath, numMaps, numTrees, p, keys, trees);
        log.debug("partition: {} numInstances: {}", p, numInstances);
    } catch (IOException e) {
        throw new IllegalStateException("Exception while loading the forest : ", e);
    }

    configure(p, dataset, keys, trees, numInstances);
}
From source file:org.apache.nutch.crawl.CrawlDbReducer.java
License:Apache License
public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
    maxInterval = job.getInt("db.fetch.interval.max", 0);
    if (oldMaxInterval > 0 && maxInterval == 0)
        maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
    schedule = FetchScheduleFactory.getFetchSchedule(job);
    int maxLinks = job.getInt("db.update.max.inlinks", 10000);
    linked = new InlinkPriorityQueue(maxLinks);
}
From source file:org.apache.nutch.crawl.LinkDbMerger.java
License:Apache License
public void configure(JobConf job) {
    maxInlinks = job.getInt("db.max.inlinks", 10000);
}
From source file:org.apache.nutch.crawl.PartitionUrlByHost.java
License:Apache License
public void configure(JobConf job) {
    seed = job.getInt("partition.url.by.host.seed", 0);
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);
}
From source file:org.apache.nutch.indexer.lucene.LuceneWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
            new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);
    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
    writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
    writer.setUseCompoundFile(false);
    writer.setSimilarity(new NutchSimilarity());
    processOptions(job);
}
From source file:org.apache.nutch.indexwriter.lucene.LuceneWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2,
            new SmartChineseAnalyzer());
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    indexWriterConfig.setMergePolicy(mergePolicy);
    indexWriterConfig.setUseCompoundFile(false);
    indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writer = new org.apache.lucene.index.IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), indexWriterConfig);
    /*
     * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job);
     */
    processOptions(job);
}