List of usage examples for org.apache.hadoop.mapred.JobConf.getBoolean

public boolean getBoolean(String name, boolean defaultValue)

Gets the value of the name property as a boolean. If the property is not set, or if its value cannot be parsed as a boolean, defaultValue is returned.
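Before the per-project examples below, here is a minimal sketch of the typical pattern: a task component reads a flag from the JobConf in configure() and falls back to a default when the property is absent. The class name FlagAwareMapper and the property name example.mapper.lowercase are hypothetical, chosen only for illustration.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class FlagAwareMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    // Hypothetical property name used only for this sketch.
    public static final String LOWERCASE_KEY = "example.mapper.lowercase";

    private boolean lowercase;

    @Override
    public void configure(JobConf job) {
        // getBoolean returns the configured value, or false here when the
        // property is unset or cannot be parsed as a boolean.
        lowercase = job.getBoolean(LOWERCASE_KEY, false);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        String text = lowercase ? value.toString().toLowerCase() : value.toString();
        output.collect(new Text(text), value);
    }
}

On the submitting side, the flag would typically be set with job.setBoolean(FlagAwareMapper.LOWERCASE_KEY, true) before the job is submitted.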
From source file:com.cloudera.recordservice.avro.AvroJob.java
License:Apache License
public static void setInputFormat(org.apache.hadoop.mapred.JobConf job,
        Class<? extends org.apache.hadoop.mapred.InputFormat> c) {
    if (job.getBoolean(USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, false)) {
        if (c.getName().equals(org.apache.avro.mapred.AvroInputFormat.class.getName())) {
            c = com.cloudera.recordservice.avro.mapred.AvroInputFormat.class;
        } else {
            throw new RuntimeException("Class '" + c.getName() + "' is not supported "
                    + "by the RecordService. Use AvroInputFormat or disable RecordService.");
        }
    }
    LOG.debug("Using input format: " + c.getName());
    job.setInputFormat(c);
}
From source file:com.cloudera.science.avro.streaming.AvroAsJSONOutputFormat.java
License:Open Source License
@Override
public RecordWriter<Text, Text> getRecordWriter(FileSystem ignored, JobConf job, String name,
        Progressable progress) throws IOException {
    if (schema == null) {
        SchemaLoader loader = new SchemaLoader(job);
        this.schema = loader.load(job.get(SCHEMA_LITERAL), job.get(SCHEMA_URL), job.get(SCHEMA_TYPE_NAME));
        this.converter = new JsonConverter(schema);
        this.readKey = job.getBoolean(READ_KEY, true);
    }

    DataFileWriter<GenericRecord> writer = new DataFileWriter<GenericRecord>(
            new GenericDatumWriter<GenericRecord>(schema));
    if (getCompressOutput(job)) {
        int level = job.getInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, AvroOutputFormat.DEFAULT_DEFLATE_LEVEL);
        String codecName = job.get(AvroJob.CONF_OUTPUT_CODEC,
                org.apache.avro.file.DataFileConstants.DEFLATE_CODEC);
        CodecFactory codec = codecName.equals(DataFileConstants.DEFLATE_CODEC)
                ? CodecFactory.deflateCodec(level)
                : CodecFactory.fromString(codecName);
        writer.setCodec(codec);
    }
    writer.setSyncInterval(job.getInt(AvroOutputFormat.SYNC_INTERVAL_KEY, DataFileConstants.DEFAULT_SYNC_INTERVAL));
    Path path = FileOutputFormat.getTaskOutputPath(job, name + AvroOutputFormat.EXT);
    writer.create(schema, path.getFileSystem(job).create(path));
    return new AvroAsJSONRecordWriter(writer, converter, readKey);
}
From source file:com.datascience.hadoop.CsvInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, ListWritable<Text>> getRecordReader(InputSplit inputSplit, JobConf conf,
        Reporter reporter) throws IOException {
    String charsetName = conf.get(CHARSET);
    Charset charset = charsetName != null ? Charset.forName(charsetName) : StandardCharsets.UTF_8;

    FileSplit split = (FileSplit) inputSplit;
    Path path = split.getPath();
    FileSystem fs = path.getFileSystem(conf);
    InputStream is = fs.open(path);

    // If the input is compressed, load the compression codec.
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(path);
    if (codec != null) {
        Decompressor decompressor = CodecPool.getDecompressor(codec);
        is = codec.createInputStream(is, decompressor);
    }
    return new CsvRecordReader(new InputStreamReader(is, charset), createFormat(conf), split.getLength(),
            conf.getBoolean(STRICT_MODE, true));
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    filter = DocumentFilter.getFilters(job);
    lowerCase = job.getBoolean("classification.tokenize", false);
    docFeaturename = job.get("classification.doc.feature.name", "label");
    String modelPath = job.get(ClassifierJob.modelNameParam);

    // optimisation for jvm reuse: do not reload the model
    if (classifier != null) {
        LOG.info("Reusing existing classifier [" + classifier.toString() + "]");
        return;
    }

    long start = System.currentTimeMillis();
    File modelFile = null;
    try {
        String modelCacheName = new Path(modelPath).getName();
        Path[] cacheFiles = DistributedCache.getLocalCacheArchives(job);
        if (null != cacheFiles && cacheFiles.length > 0) {
            for (Path cachePath : cacheFiles) {
                LOG.info("LocalCache : " + cachePath.toUri());
                LOG.info("modelCacheName : " + modelCacheName);
                if (cachePath.toUri().toString().endsWith(modelCacheName)) {
                    String parent = new File(cachePath.toUri().getPath()).toString();
                    modelFile = new File(parent, modelCacheName.replaceAll(".zip", ""));
                    LOG.info("Unzipped ? " + modelFile.getAbsolutePath());
                    boolean doesExist = modelFile.exists();
                    LOG.info("modelFile exists " + doesExist);
                    // if it does not exist it must have been unpacked at the parent level
                    if (!doesExist) {
                        modelFile = new File(parent);
                    }
                    break;
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException("Impossible to retrieve model from distributed cache", ioe);
    }
    try {
        classifier = classifier.getClassifier(modelFile);
    } catch (Exception e) {
        throw new RuntimeException("Impossible to load model from " + modelFile, e);
    }
    long end = System.currentTimeMillis();
    LOG.info("Model loaded in " + (end - start) + " msec");
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    String zkHost = job.get("solr.zkhost");
    if (zkHost != null && zkHost.equals("") == false) {
        String collection = job.get("solr.zk.collection", "collection1");
        LOG.info("Indexing to collection: " + collection + " w/ ZK host: " + zkHost);
        solr = new CloudSolrServer(zkHost);
        ((CloudSolrServer) solr).setDefaultCollection(collection);
    } else {
        String solrURL = job.get("solr.server.url");
        int queueSize = job.getInt("solr.client.queue.size", 100);
        int threadCount = job.getInt("solr.client.threads", 1);
        solr = new StreamingUpdateSolrServer(solrURL, queueSize, threadCount);
    }
    includeMetadata = job.getBoolean("lw.metadata", false);
    includeAnnotations = job.getBoolean("lw.annotations", false);

    // get the Behemoth annotation types and features to store as SOLR fields
    // solr.f.name = BehemothType.featureName
    // e.g. solr.f.person = Person.string
    Iterator<Entry<String, String>> iterator = job.iterator();
    while (iterator.hasNext()) {
        Entry<String, String> entry = iterator.next();
        if (entry.getKey().startsWith("solr.f.") == false)
            continue;
        String fieldName = entry.getKey().substring("solr.f.".length());
        String val = entry.getValue();
        // see if a feature has been specified;
        // if not we'll use '*' to indicate that we want the text covered by the annotation
        HashMap<String, String> featureValMap = new HashMap<String, String>();
        int separator = val.indexOf(".");
        String featureName = "*";
        if (separator != -1)
            featureName = val.substring(separator + 1);
        featureValMap.put(featureName, fieldName);
        fieldMapping.put(entry.getValue(), featureValMap);
        LOG.debug("Adding to mapping " + entry.getValue() + " " + featureName + " " + fieldName);
    }
}
From source file:com.digitalpebble.behemoth.solr.SOLRWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    String zkHost = job.get("solr.zkhost");
    if (zkHost != null && zkHost.equals("") == false) {
        String collection = job.get("solr.zk.collection", "collection1");
        LOG.info("Indexing to collection: " + collection + " w/ ZK host: " + zkHost);
        solr = new CloudSolrServer(zkHost);
        ((CloudSolrServer) solr).setDefaultCollection(collection);
    } else {
        String solrURL = job.get("solr.server.url");
        int queueSize = job.getInt("solr.client.queue.size", 100);
        int threadCount = job.getInt("solr.client.threads", 1);
        solr = new ConcurrentUpdateSolrServer(solrURL, queueSize, threadCount);
    }

    String paramsString = job.get("solr.params");
    if (paramsString != null) {
        params = new ModifiableSolrParams();
        String[] pars = paramsString.trim().split("\\&");
        for (String kvs : pars) {
            String[] kv = kvs.split("=");
            if (kv.length < 2) {
                LOG.warn("Invalid Solr param " + kvs + ", skipping...");
                continue;
            }
            params.add(kv[0], kv[1]);
        }
        LOG.info("Using Solr params: " + params.toString());
    }

    includeMetadata = job.getBoolean("solr.metadata", false);
    includeAnnotations = job.getBoolean("solr.annotations", false);
    useMetadataPrefix = job.getBoolean("solr.metadata.use.prefix", false);
    metadataPrefix = job.get("solr.metadata.prefix", "attr_");
    annotationPrefix = job.get("solr.annotation.prefix", "annotate_");
    useAnnotationPrefix = job.getBoolean("solr.annotation.use.prefix", false);
    populateSolrFieldMappingsFromBehemothAnnotationsTypesAndFeatures(job);
}
From source file:com.ebay.erl.mobius.core.datajoin.EvenlyPartitioner.java
License:Apache License
/**
 * Read in the partition file and build indexing data structures.
 * If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
 * <tt>total.order.partitioner.natural.order</tt> is not false, a trie
 * of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
 * will be built. Otherwise, keys will be located using a binary search of
 * the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
 * defined for this job. The input file must be sorted with the same
 * comparator and contain {@link org.apache.hadoop.mapred.JobConf#getNumReduceTasks} - 1 keys.
 */
@SuppressWarnings("unchecked") // keytype from conf not static
public void configure(JobConf job) {
    try {
        String parts = getPartitionFile(job);
        final Path partFile = new Path(parts);
        final FileSystem fs = (DEFAULT_PATH.equals(parts)) ? FileSystem.getLocal(job) // assume in DistributedCache
                : partFile.getFileSystem(job);

        //Class<K> keyClass = (Class<K>)job.getMapOutputKeyClass();
        K[] splitPoints = readPartitions(fs, partFile, (Class<K>) Tuple.class, job);
        if (splitPoints.length != job.getNumReduceTasks() - 1) {
            throw new IOException("Wrong number of partitions in keyset");
        }
        RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
        for (int i = 0; i < splitPoints.length - 1; ++i) {
            if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
                throw new IOException("Split points are out of order");
            }
        }
        boolean natOrder = job.getBoolean("total.order.partitioner.natural.order", true);
        if (natOrder && BinaryComparable.class.isAssignableFrom(Tuple.class)) {
            partitions = buildTrie((BinaryComparable[]) splitPoints, 0, splitPoints.length, new byte[0],
                    job.getInt("total.order.partitioner.max.trie.depth", 2));
        } else {
            partitions = new BinarySearchNode(splitPoints, comparator);
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Can't read partitions file", e);
    }
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, get samples from the data sources
        ////////////////////////////////////////////////
        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(new BufferedOutputStream(
                    new FileOutputStream(new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////
        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load evenly across reducers by
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries (objects selected from the
            // <code>samples</code> array); each block should have
            // about the same size.

            // find the last index of the element that equals samples[i], as
            // such an element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; //Util.findLowerBound(samples, samples[i], comparator);

            // the repeat count of samples[i]; if the key itself is too big,
            // select it as a boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times the average reducer size
            {
                // the current element is too big, greater than
                // two times the <code>avgReduceSize</code>,
                // put it as a boundary by itself
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                //pw.println(samples[i]);

                // immediately put the next element on the boundary; the next
                // element starts at <code>upperBound + 1</code>, to prevent
                // the current one from consuming even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    //pw.println(samples[upperBound+1]);

                    // move on to the element after <code>samples[upperBound + 1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // together with the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as a boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    //pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, then the key space is too small, and
        // TotalOrderPartitioner won't work: it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define boundaries; e.g., if
            // the sample size is two with elements [300, 1000], then
            // there should be 3 reducers: one handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseInputFormatUtil.java
License:Apache License
/**
 * Parse {@code jobConf} to create a {@link Scan} instance.
 */
public static Scan getScan(JobConf jobConf) throws IOException {
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    ColumnMappings columnMappings;

    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    if (columnMappings.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }

    boolean readAllColumns = ColumnProjectionUtils.isReadAllColumns(jobConf);
    Scan scan = new Scan();
    boolean empty = true;

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    if (!readAllColumns) {
        ColumnMapping[] columnsMapping = columnMappings.getColumnsMapping();
        for (int i : readColIDs) {
            ColumnMapping colMap = columnsMapping[i];
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
                addedFamilies.add(colMap.familyName);
            } else {
                if (!addedFamilies.contains(colMap.familyName)) {
                    // add only if the corresponding family has not already been added
                    scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
                }
            }
            empty = false;
        }
    }

    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // table's column projection.
    if (empty) {
        for (ColumnMapping colMap : columnMappings) {
            if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
                continue;
            }
            if (colMap.qualifierName == null) {
                scan.addFamily(colMap.familyNameBytes);
            } else {
                scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
            }
            if (!readAllColumns) {
                break;
            }
        }
    }

    String scanCache = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHE);
    if (scanCache != null) {
        scan.setCaching(Integer.valueOf(scanCache));
    }
    String scanCacheBlocks = jobConf.get(HBaseSerDe.HBASE_SCAN_CACHEBLOCKS);
    if (scanCacheBlocks != null) {
        scan.setCacheBlocks(Boolean.valueOf(scanCacheBlocks));
    }
    String scanBatch = jobConf.get(HBaseSerDe.HBASE_SCAN_BATCH);
    if (scanBatch != null) {
        scan.setBatch(Integer.valueOf(scanBatch));
    }
    return scan;
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);

    if (hbaseColumnsMapping == null) {
        throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }

    ColumnMappings columnMappings = null;
    int iTimeColumn = -1;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        iTimeColumn = HBaseSerDe.getTxTimeColumnIndex(columnMappings, jobConf);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    if (iTimeColumn != -1) {
        List<org.apache.hadoop.mapreduce.InputSplit> splits = TxHiveTableInputFormatUtil.getSplits(jobConf,
                numSplits, columnMappings, iTimeColumn, hbaseTableName);
        if (splits != null) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            InputSplit[] results = new InputSplit[splits.size()];
            for (int i = 0; i < splits.size(); i++) {
                results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0], true);
            }
            LOG.info("getSplits: TxHiveIndexScan");
            return results;
        }
    }
    LOG.info("getSplits: no TxHiveIndexScan");

    setHTable(new HTable(HBaseConfiguration.create(jobConf), Bytes.toBytes(hbaseTableName)));

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(
            keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
            continue;
        }
        if (colMap.qualifierName == null) {
            scan.addFamily(colMap.familyNameBytes);
            addedFamilies.add(colMap.familyName);
        } else {
            if (!addedFamilies.contains(colMap.familyName)) {
                // add the column only if the family has not already been added
                scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
            }
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}