Example usage for org.apache.hadoop.mapred JobConf getInt

List of usage examples for org.apache.hadoop.mapred JobConf getInt

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.getInt.

Prototype

public int getInt(String name, int defaultValue) 

Document

Get the value of the name property as an int. If no such property exists, the provided defaultValue is returned.
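
Before the full examples below, a minimal sketch of the call; the property name "example.num.partitions" and the class name are hypothetical, used only for illustration. The second argument to getInt is the default returned when the property is unset.

import org.apache.hadoop.mapred.JobConf;

public class GetIntExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Hypothetical property name, set here so the lookup below succeeds.
        job.setInt("example.num.partitions", 8);

        // Returns 8; would return the default (4) if the property were unset.
        int partitions = job.getInt("example.num.partitions", 4);
        System.out.println("partitions = " + partitions);
    }
}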

Usage

From source file:edu.ub.ahstfg.indexer.mapred.IndexRecordReader.java

License:Open Source License

/**
 * Sole constructor.
 * @param job Job reading the records.
 * @param input Split containing the records.
 * @param reporter Job reporter.
 * @throws IOException
 */
public IndexRecordReader(JobConf job, FileSplit input, Reporter reporter) throws IOException {
    this.reporter = reporter;

    lineReader = new LineRecordReader(job, input);
    lineKey = lineReader.createKey();
    lineValue = lineReader.createValue();

    numMachines = job.getInt(ParamSet.NUM_MACHINES, 10);
    numDocs = job.getInt(ParamSet.NUM_DOCS, 1000);
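    // Distribute numDocs across numMachines: an even quotient per mapper, plus a remainder.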
    qDocsPerMapper = numDocs / numMachines;
    rDocsPerMapper = numDocs - (qDocsPerMapper * numMachines);

    fillDocsPerMapper();
}

From source file:edu.ubc.mirrors.holographs.mapreduce.SnapshotObjectsOfTypeInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    ISnapshot snapshot = SnapshotUtils.openSnapshot(job);
    int[] objectIDs = SnapshotUtils.getInputObjectIDs(job, snapshot);
    SnapshotFactory.dispose(snapshot);

    int maxNumObjects = job.getInt("maxNumObjects", objectIDs.length);
    int numObjects = Math.min(objectIDs.length, maxNumObjects);

    int approxSplitSize = job.getInt("splitSize", -1);
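    // If no split size was configured, derive it from the requested number of
    // splits; otherwise recompute the number of splits from the split size.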
    if (approxSplitSize == -1) {
        approxSplitSize = Math.max(1, (int) (((float) numObjects) / numSplits));
    } else {
        numSplits = Math.max(1, (int) (((float) numObjects) / approxSplitSize));
    }

    InputSplit[] splits = new InputSplit[numSplits];
    int offset = 0;
    for (int index = 0; index < numSplits; index++) {
        int splitSize = Math.min(numObjects - offset, approxSplitSize);
        if (index == numSplits - 1) {
            splitSize = numObjects - offset;
        }
        int[] slice = new int[splitSize];
        System.arraycopy(objectIDs, offset, slice, 0, splitSize);
        splits[index] = new ObjectIDArraySplit(slice);

        offset += splitSize;
    }
    return splits;
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapNewRecord.java

License:Apache License

@Override
public void configure(JobConf job) {
    //
    // create RecordGenerator
    //
    int offset = job.getInt(FuzzyJoinDriver.DATA_CRTCOPY_PROPERTY, -1);
    if (offset == -1) {
        System.err.println("ERROR: fuzzyjoin.data.crtcopy not set.");
        System.exit(-1);
    }
    recordGenerator = new RecordGenerator(offset);
    int noRecords = job.getInt(FuzzyJoinDriver.DATA_NORECORDS_PROPERTY, -1);
    if (noRecords == -1) {
        System.err.println("ERROR: fuzzyjoin.data.norecords not set.");
        System.exit(-1);
    }
    offsetRID = offset * noRecords;
    int dictionaryFactor = job.getInt(FuzzyJoinDriver.DATA_DICTIONARY_FACTOR_PROPERTY, 1);
    //
    // set RecordGenerator
    //
    Path tokenRankFile;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
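        // Prefer the copy from the distributed cache; if nothing was cached,
        // fall back to the path configured in the job.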
        if (cache == null) {
            tokenRankFile = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
        } else {
            tokenRankFile = cache[0];
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    TokenLoad tokenLoad = new TokenLoad(tokenRankFile.toString(), recordGenerator);
    tokenLoad.loadTokenRank(dictionaryFactor);
    //
    // set Tokenizer
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    // Arrays.sort(dataColumns);
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.datagen.MapRecordOnly.java

License:Apache License

@Override
public void configure(JobConf job) {
    noRecords = job.getInt(FuzzyJoinDriver.DATA_NORECORDS_PROPERTY, -1);
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapJoin.java

License:Apache License

@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
                // Property not set (job.get returned null); length stats are optional.
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridpairs.ppjoin.MapSelfJoin.java

License:Apache License

@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
}

From source file:edu.uci.ics.fuzzyjoin.hadoop.ridrecordpairs.ppjoin.MapJoin.java

License:Apache License

@Override
public void configure(JobConf job) {
    //
    // set Tokenizer and SimilarityFilters
    //
    tokenizer = TokenizerFactory.getTokenizer(
            job.get(FuzzyJoinConfig.TOKENIZER_PROPERTY, FuzzyJoinConfig.TOKENIZER_VALUE),
            FuzzyJoinConfig.WORD_SEPARATOR_REGEX, FuzzyJoinConfig.TOKEN_SEPARATOR);
    String similarityName = job.get(FuzzyJoinConfig.SIMILARITY_NAME_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_NAME_VALUE);
    float similarityThreshold = job.getFloat(FuzzyJoinConfig.SIMILARITY_THRESHOLD_PROPERTY,
            FuzzyJoinConfig.SIMILARITY_THRESHOLD_VALUE);
    similarityFilters = SimilarityFiltersFactory.getSimilarityFilters(similarityName, similarityThreshold);
    //
    // set TokenRank and TokenGroup
    //
    Path tokensPath;
    Path lengthstatsPath = null;
    try {
        Path[] cache = DistributedCache.getLocalCacheFiles(job);
        if (cache == null) {
            tokensPath = new Path(job.get(FuzzyJoinConfig.DATA_TOKENS_PROPERTY));
            try {
                lengthstatsPath = new Path(job.get(FuzzyJoinDriver.DATA_LENGTHSTATS_PROPERTY));
            } catch (IllegalArgumentException e) {
            }
        } else {
            tokensPath = cache[0];
            if (job.getBoolean(FuzzyJoinDriver.TOKENS_LENGTHSTATS_PROPERTY,
                    FuzzyJoinDriver.TOKENS_LENGTHSTATS_VALUE)) {
                lengthstatsPath = cache[1];
            }
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String recordGroupClass = job.get(FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_CLASS_VALUE);
    int recordGroupFactor = job.getInt(FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_PROPERTY,
            FuzzyJoinDriver.RIDPAIRS_GROUP_FACTOR_VALUE);
    recordGroup = RecordGroupFactory.getRecordGroup(recordGroupClass,
            Math.max(1, job.getNumReduceTasks() * recordGroupFactor), similarityFilters, "" + lengthstatsPath);
    TokenLoad tokenLoad = new TokenLoad(tokensPath.toString(), tokenRank);
    tokenLoad.loadTokenRank();
    //
    // set dataColumn
    //
    dataColumns[0] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA_PROPERTY, FuzzyJoinConfig.RECORD_DATA_VALUE));
    dataColumns[1] = FuzzyJoinUtil
            .getDataColumns(job.get(FuzzyJoinConfig.RECORD_DATA1_PROPERTY, FuzzyJoinConfig.RECORD_DATA1_VALUE));
    //
    // get suffix for second relation
    //
    suffixSecond = job.get(FuzzyJoinDriver.DATA_SUFFIX_INPUT_PROPERTY, "")
            .split(FuzzyJoinDriver.SEPSARATOR_REGEX)[1];
}

From source file:edu.uci.ics.hyracks.hadoop.compat.util.HadoopAdapter.java

License:Apache License

public static ExternalSortOperatorDescriptor getExternalSorter(JobConf conf, IOperatorDescriptorRegistry spec) {
    RecordDescriptor recordDescriptor = getHadoopRecordDescriptor(conf.getMapOutputKeyClass().getName(),
            conf.getMapOutputValueClass().getName());
    WritableComparator writableComparator = WritableComparator
            .get(conf.getMapOutputKeyClass().asSubclass(WritableComparable.class));
    WritableComparingBinaryComparatorFactory comparatorFactory = new WritableComparingBinaryComparatorFactory(
            writableComparator.getClass());
    ExternalSortOperatorDescriptor externalSortOp = new ExternalSortOperatorDescriptor(spec,
            conf.getInt(HYRACKS_EX_SORT_FRAME_LIMIT, DEFAULT_EX_SORT_FRAME_LIMIT), new int[] { 0 },
            new IBinaryComparatorFactory[] { comparatorFactory }, recordDescriptor);
    return externalSortOp;
}

From source file:edu.ucsb.cs.hybrid.HybridDriver.java

License:Apache License

public static void main(String args[]) throws ParseException, IOException {

    // job.set("mapred.job.tracker", "local");
    // job.set("fs.default.name", "file:///");

    JobConf job = new JobConf();
    job.setJarByClass(HybridDriver.class);
    new GenericOptionsParser(job, args);
    setMapperAndRunner(job);
    job.setMapOutputKeyClass(DocDocWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(DocDocWritable.class);
    job.setOutputValueClass(FloatWritable.class);

    Path inputPath = new Path(INPUT_DIR);
    CustomSequenceFileInputFormat.addInputPath(job, inputPath);
    Path outputPath = new Path(OUTPUT_DIR);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    job.setBoolean("fs.hdfs.impl.disable.cache", true); // possibly unnecessary

    if (job.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE)) {
        job.setInputFormat(CustomSequenceFileInputFormat.class);
        Long splitMB = job.getLong(Config.SPLIT_MB_PROPERTY, Config.SPLIT_MB_VALUE) * 1024 * 1024;
        job.setLong("mapred.min.split.size", splitMB);
        job.setLong("mapred.max.split.size", splitMB);
        job.setLong("dfs.block.size", splitMB);
    } else {
        // For the WWW experiments these two lines should be commented out: the
        // Splitter assumes partitions are not split for load balancing (to be fixed).
        Splitter.configure(job, inputPath);
        job.setInputFormat(NonSplitableSequenceInputFormat.class);
    }
    // SIGIR'14 two-stage balancing (not yet fully incorporated)
    if (job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE) != 0) {
        TwoStageLoadbalancing.main(job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE),
                new Path(PartDriver.OUTPUT_DIR), job);
    }
    JobSubmitter.run(job, "SIMILARITY", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
    if (job.getBoolean(Config.CONVERT_TEXT_PROPERTY, Config.CONVERT_TEXT_VALUE))
        IDMappingJob(args);
}

From source file:edu.ucsb.cs.hybrid.HybridDriver.java

License:Apache License

/**
 * @param job Job whose mapper and runner classes are set by this method.
 */
public static void setMapperAndRunner(JobConf job) {
    int numSplits = job.getInt(Config.NUMBER_SPLITS_PROPERTY, Config.NUMBER_SPLITS_VALUE);
    int PSSChoice = job.getInt(Config.BLOCK_CHOICE_PROPERTY, Config.BLOCK_CHOICE_VALUE); // 1 or 2
    String name = "PSS";
    if (numSplits > 1) {
        // TODO: can the number of splits for the runner be set here?
        job.setMapRunnerClass(MultipleS_Runner.class);
        if (job.getBoolean(Config.MULTI_THREADS_PROPERTY, Config.MULTI_THREADS_VALUE)) { // threads testing
            job.setMapperClass(PSS1_Threaded_Mapper.class);// naming
        } else if (PSSChoice == 1) {
            name += "1";
            job.setMapperClass(PSS1_Mapper.class);
        } else if (PSSChoice == 2) {
            name += "2";
            job.setMapperClass(PSS2_Mapper.class); // MultipleS_Block1_Mapper
        }
        // Other values of PSSChoice are reserved for future implementations.
    } else {
        job.setMapRunnerClass(SingleS_Runner.class);
        if (job.getBoolean(Config.MULTI_THREADS_PROPERTY, Config.MULTI_THREADS_VALUE)) // threads
            throw new RuntimeException(
                    "ERROR: Single S with multithreads! Set hybrid.threads.property to false.");
        if (PSSChoice == 1) {
            job.setMapperClass(PSS_Mapper.class);
            if (job.getBoolean(Config.BAYADRO_SKIP_PROPERTY, Config.BAYADRO_SKIP_VALUE)) {
                name += "/Bayardo_Dynamic_filter";
                job.setMapperClass(PSS_Bayardo_Mapper.class);//PSS+Bayardo WWW'07
            }
        } else if (PSSChoice == 2) {
            name += "2/SingleS";
            job.setMapperClass(PSS2_SingleS_Mapper.class);
        } else
            job.setMapperClass(PSS3_SingleS_Mapper.class); //what is this?
    }
    job.setJobName(name);
}