List of usage examples for org.apache.hadoop.mapred.JobConf.get
public String get(String name, String defaultValue)

Returns the value of the name property, or defaultValue if the property is not set.

Parameters:
name - the property name to look up.
defaultValue - the value returned when the property is not set.
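Before the examples from real projects, here is a minimal, self-contained sketch of the call pattern they all share. The property key my.example.property and its values are made up for illustration; the behavior shown assumes the key is not defined in any loaded configuration resource.

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetExample {
    public static void main(String[] args) {
        // An empty JobConf loads only the default Hadoop resources, so the
        // hypothetical key below is not set and get() returns the supplied default.
        JobConf conf = new JobConf();
        System.out.println(conf.get("my.example.property", "not-set"));   // prints "not-set"

        conf.set("my.example.property", "configured");
        System.out.println(conf.get("my.example.property", "not-set"));   // prints "configured"
    }
}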
. From source file:com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java
License:Apache License
/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobConf conf) {
    String dirs = conf.get("mapred.input.dir", "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}
From source file:com.blackberry.logdriver.mapred.BinaryRecordWriter.java
License:Apache License
public BinaryRecordWriter(JobConf job) {
    String extension = job.get("output.file.extension", "");
    String taskid = job.get("mapred.task.id");
    try {
        Path outputPath = BinaryOutputFormat.getTaskOutputPath(job, taskid + extension);
        FileSystem fs = FileSystem.get(job);
        LOG.info("Creating output path: {}", outputPath);
        out = fs.create(outputPath, true);
    } catch (IOException e) {
        LOG.error("Error creating output file.", e);
    }
}
From source file:com.cloudera.hive.scd.SQLUpdater.java
License:Open Source License
private List<String> loadUpdateStatements(InputSplit split, JobConf jc) throws IOException {
    long currentSCDTime = asSCDTime(jc.get("scd.time", ""), System.currentTimeMillis());
    List<String> stmts = Lists.newArrayList();
    if (split instanceof FileSplit) {
        Path base = ((FileSplit) split).getPath();
        FileSystem fs = base.getFileSystem(jc);
        Path updates = new Path(base.getParent(), ".updates");
        if (fs.exists(updates)) {
            stmts.addAll(readLines(fs, updates, currentSCDTime));
        }
    }
    return stmts;
}
From source file:com.davidgildeh.hadoop.input.simpledb.SimpleDBInputFormat.java
License:Apache License
/**
 * Main method to generate splits from SimpleDB. Takes a start and end date range
 * and generates split periods to filter SimpleDB based on the split period given in
 * the configuration.
 *
 * @param jobConf The Map Task Job Configuration
 * @param numSplits Hint to calculate the number of splits
 * @return The Input Splits
 * @throws IOException
 */
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    // Get the split size (number of rows), defaults to 100,000 as much larger seems to
    // screw up getting accurate rows
    int splitSize = Integer
            .parseInt(jobConf.get(SimpleDBInputFormat.SIMPLEDB_SPLIT_SIZE, String.valueOf(MAX_SPLIT_SIZE)));
    if (splitSize > MAX_SPLIT_SIZE) {
        splitSize = MAX_SPLIT_SIZE;
    }

    // Get total number of rows to calculate number of splits required
    SimpleDBDAO sdb = new SimpleDBDAO(jobConf);
    long totalItems;
    if (sdb.getWhereQuery() != null) {
        totalItems = sdb.getCount();
    } else {
        totalItems = sdb.getTotalItemCount();
    }
    long totalSplits = 1;
    if (splitSize < totalItems) {
        totalSplits = totalItems / splitSize;
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Total Rows:" + String.valueOf(totalItems));
        LOG.debug("Total Splits:" + String.valueOf(totalSplits));
    }

    // Array to hold splits
    ArrayList<SimpleDBInputSplit> splits = new ArrayList<SimpleDBInputSplit>();

    // Create Splits
    for (int i = 0; i < totalSplits; i++) {
        String splitToken = sdb.getSplitToken(i, splitSize);
        long startRow = i * splitSize;
        long endRow = startRow + splitSize;
        if (endRow > totalItems) {
            endRow = totalItems;
        }
        SimpleDBInputSplit split = new SimpleDBInputSplit(startRow, endRow, splitToken);
        splits.add(split);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Created Split: " + split.toString());
        }
    }

    // Return array of splits
    return splits.toArray(new SimpleDBInputSplit[splits.size()]);
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    filter = DocumentFilter.getFilters(job);
    lowerCase = job.getBoolean("classification.tokenize", false);
    docFeaturename = job.get("classification.doc.feature.name", "label");
    String modelPath = job.get(ClassifierJob.modelNameParam);

    // optimisation for jvm reuse
    // do not reload the model
    if (classifier != null) {
        LOG.info("Reusing existing classifier [" + classifier.toString() + "]");
        return;
    }

    long start = System.currentTimeMillis();
    File modelFile = null;
    try {
        String modelCacheName = new Path(modelPath).getName();
        Path[] cacheFiles = DistributedCache.getLocalCacheArchives(job);
        if (null != cacheFiles && cacheFiles.length > 0) {
            for (Path cachePath : cacheFiles) {
                LOG.info("LocalCache : " + cachePath.toUri());
                LOG.info("modelCacheName : " + modelCacheName);
                if (cachePath.toUri().toString().endsWith(modelCacheName)) {
                    String parent = new File(cachePath.toUri().getPath()).toString();
                    modelFile = new File(parent, modelCacheName.replaceAll(".zip", ""));
                    LOG.info("Unzipped ? " + modelFile.getAbsolutePath());
                    boolean doesExist = modelFile.exists();
                    LOG.info("modelFile exists " + doesExist);
                    // if it does not exist it must have been unpacked at
                    // the parent level
                    if (!doesExist) {
                        modelFile = new File(parent);
                    }
                    break;
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException("Impossible to retrieve model from distributed cache", ioe);
    }
    try {
        classifier = classifier.getClassifier(modelFile);
    } catch (Exception e) {
        throw new RuntimeException("Impossible to load model from " + modelFile, e);
    }
    long end = System.currentTimeMillis();
    LOG.info("Model loaded in " + (end - start) + " msec");
}
From source file:com.ebay.erl.mobius.core.builder.TSVDatasetBuilder.java
License:Apache License
/**
 * {@inheritDoc}
 */
@Override
public Dataset buildFromPreviousJob(JobConf prevJob, Class<? extends FileOutputFormat> prevJobOutputFormat,
        String[] schema) throws IOException {
    if (prevJobOutputFormat.equals(TextOutputFormat.class)) {
        // no need to validate the input path as it's coming from
        // previous dataset
        this.addInputPath(false, FileOutputFormat.getOutputPath(prevJob));
        this.setSchema(schema);
        this.setDelimiter(prevJob.get(ConfigureConstants.TUPLE_TO_STRING_DELIMITER, "\t"));
        return this.build();
    } else {
        throw new IllegalArgumentException(this.getClass().getCanonicalName() + " cannot build dataset from "
                + prevJobOutputFormat.getCanonicalName() + ", only " + TextOutputFormat.class.getCanonicalName()
                + " is supported.");
    }
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Get the path to the SequenceFile storing the sorted partition keyset.
 *
 * @see #setPartitionFile(JobConf,Path)
 */
public static String getPartitionFile(JobConf job) {
    return job.get("total.order.partitioner.path", DEFAULT_PATH);
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
@Override
protected synchronized void submit() {
    JobConf jobConf = this.getJobConf();
    boolean isLocalHadoop = jobConf.get("mapred.job.tracker", "local").equals("local");

    // the default partitioner is {@link com.ebay.erl.mobius.core.datajoin.DataJoinKeyPartitioner}
    // which is hash based.
    //
    // If user choose to use even partitioner, Mobius will use
    // {@link com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner} which
    // is sampling based partitioner of attempting to balance the load
    // for each reducer.
    String partitioner = jobConf.get("mobius.partitioner", "default");

    if (!isLocalHadoop && jobConf.getNumReduceTasks() != 0 && partitioner.equals("even")) {
        // this job needs reducer, perform sampling on the keys to
        // make load on reducers are almost evenly distributed.
        double freq = jobConf.getFloat("mobius.sampler.freq", 0.1F);
        int numSamples = jobConf.getInt("mobius.sampler.num.samples", 50000);
        int maxSplits = jobConf.getInt("mobius.sampler.max.slipts.sampled", 5);

        // log sampling parameters so that user knows.
        LOGGER.info("Sampling parameters { " + "mobius.sampler.freq:" + format.format(freq) + ", "
                + "mobius.sampler.num.samples:" + numSamples + ", " + "mobius.sampler.max.slipts.sampled:"
                + maxSplits + "}");

        InputSampler.Sampler<?, ?> sampler = new MobiusInputSampler(freq, numSamples, maxSplits);

        writePartitionFile(jobConf, sampler);

        // add to distributed cache
        try {
            URI partitionUri = new URI(TotalOrderPartitioner.getPartitionFile(jobConf) + "#_partitions");
            LOGGER.info("Adding partition uri to distributed cache:" + partitionUri.toString());

            DistributedCache.addCacheFile(partitionUri, jobConf);
            DistributedCache.createSymlink(jobConf);
            jobConf.setPartitionerClass(EvenlyPartitioner.class);

            LOGGER.info("Using " + EvenlyPartitioner.class.getCanonicalName()
                    + " to partiton the keys evenly among reducers.");
        } catch (URISyntaxException e) {
            LOGGER.error(e.getMessage(), e);
            throw new RuntimeException(e);
        }

        // adding -XX:-UseParallelOldGC, this will automatically set -XX:-UseParallelGC
        // according to Oracle's specification
        String jvmOpts = jobConf.get("mapred.child.java.opts", "");
        if (jvmOpts.isEmpty()) {
            jvmOpts = "-XX:-UseParallelOldGC";
        } else {
            if (jvmOpts.indexOf("-XX:-UseParallelOldGC") < 0) {
                // remove "
                jvmOpts = jvmOpts.replaceAll("\"", "");
                jvmOpts = jvmOpts.concat(" -XX:-UseParallelOldGC");
            }
        }
        jobConf.set("mapred.child.java.opts", jvmOpts);

        this.setJobConf(jobConf);
    }
    LOGGER.info("Submiting job:" + jobConf.getJobName());
    super.submit();
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////

        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(
                    new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load for every reducer evenly,
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries, objects that selected from the
            // <code>samples</code> array, and each blocks should have
            // about the same size.

            // find the last index of element that equals to samples[i], as
            // such element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; //Util.findLowerBound(samples, samples[i], comparator);

            // the repeat time of samples[i], if the key itself is too big
            // select it as boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times of average reducer size
            {
                // the current element is too big, greater than
                // two times of the <code>avgReduceSize</code>,
                // put itself as boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediate put the next element to the boundary,
                // the next element starts at <code>upperBound+1</code>,
                // to prevent the current one consume even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the next element of <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // current element is small enough to be consider
                // with previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of wrote samples doesn't equals to number of
        // reducer minus one, then it means the key spaces is too small
        // hence TotalOrderPartitioner won't work, it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the wrote samples define boundary, ex, if
            // the sample size is two with two element [300, 1000], then
            // there should be 3 reducers, one for handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
From source file:com.ebay.erl.mobius.core.mapred.MobiusInputSampler.java
License:Apache License
@Override
public Object[] getSample(InputFormat inf, JobConf job) throws IOException {
    // the following codes are copied from {@link InputSampler#RandomSampler},
    // but require some modifications.

    InputSplit[] splits = inf.getSplits(job, job.getNumMapTasks());
    ArrayList<DataJoinKey> samples = new ArrayList<DataJoinKey>(this.numSamples);
    int splitsToSample = Math.min(this.maxSplitsSampled, splits.length);

    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);

    // get Sorters
    Sorter[] sorters = null;
    if (job.get(ConfigureConstants.SORTERS, null) != null) {
        // total sort job
        sorters = (Sorter[]) SerializableUtil.deserializeFromBase64(job.get(ConfigureConstants.SORTERS), job);
    } else {
        // there is no sorter, should be reducer/join job
        Column[] keys = (Column[]) SerializableUtil
                .deserializeFromBase64(job.get(ConfigureConstants.ALL_GROUP_KEY_COLUMNS), job);
        sorters = new Sorter[keys.length];
        for (int i = 0; i < keys.length; i++) {
            sorters[i] = new Sorter(keys[i].getInputColumnName(), Ordering.ASC);
        }
    }

    long proportion = 10L;
    while ((int) (this.freq * proportion) == 0) {
        proportion = proportion * 10;
    }
    proportion = 5L * proportion;

    // shuffle splits
    for (int i = 0; i < splits.length; ++i) {
        InputSplit tmp = splits[i];
        int j = r.nextInt(splits.length);
        splits[i] = splits[j];
        splits[j] = tmp;
    }

    SamplingOutputCollector collector = new SamplingOutputCollector();
    for (int i = 0; i < splitsToSample || (i < splits.length && samples.size() < numSamples); i++) {
        LOGGER.info("Sampling from split #" + (i + 1) + ", collected samples:" + samples.size());
        RecordReader<WritableComparable, WritableComparable> reader = inf.getRecordReader(splits[i], job,
                Reporter.NULL);
        WritableComparable key = reader.createKey();
        WritableComparable value = reader.createValue();

        if (!(inf instanceof MobiusDelegatingInputFormat)) {
            // not mobius delegating input format, so the CURRENT_DATASET_ID
            // will not be set by inf#getRecordReader, we set them here.
            //
            // set the current dataset id, as the AbstractMobiusMapper#configure
            // method needs this property.
            job.set(ConfigureConstants.CURRENT_DATASET_ID, job.get(ConfigureConstants.ALL_DATASET_IDS));
        }

        Byte datasetID = Byte.valueOf(job.get(ConfigureConstants.CURRENT_DATASET_ID));
        LOGGER.info("Samples coming from dataset: " + datasetID.toString());

        AbstractMobiusMapper mapper = this.getMapper(inf, splits[i], job);
        mapper.configure(job);

        // reading elements from one split
        long readElement = 0;

        while (reader.next(key, value)) {
            collector.clear();
            Tuple tuple = mapper.parse(key, value);

            readElement++;
            if (readElement > (((long) numSamples) * ((long) proportion))) {
                // a split might be very big (ex: a large gz file),
                // so we just need to read the
                break;
            }

            if (r.nextDouble() <= freq) {
                if (samples.size() < numSamples) {
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    // joinmap function might generate more than one output key
                    // per <code>key</code> input.
                    for (Tuple t : collector.getOutKey()) {
                        Tuple mt = Tuple.merge(tuple, t);
                        DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                        samples.add(nkey);
                    }
                } else {
                    // When exceeding the maximum number of samples, replace
                    // a random element with this one, then adjust the
                    // frequency to reflect the possibility of existing
                    // elements being pushed out
                    mapper.joinmap(key, value, collector, Reporter.NULL);
                    for (Tuple t : collector.getOutKey()) {
                        int ind = r.nextInt(numSamples);
                        if (ind != numSamples) {
                            Tuple mt = Tuple.merge(tuple, t);
                            DataJoinKey nkey = this.getKey(mt, sorters, datasetID, mapper, job);
                            samples.set(ind, nkey);
                        }
                    }

                    freq *= (numSamples - collector.getOutKey().size()) / (double) numSamples;
                }
                key = reader.createKey();
                value = reader.createValue();
            }
        }
        reader.close();
    }
    LOGGER.info("Samples have been collected, return.");
    return samples.toArray();
}