Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usages of the JobConf constructor from org.apache.hadoop.mapred.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
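
A minimal sketch of the prototype above, assuming nothing beyond the constructor itself (the class name and the property used here are illustrative, not taken from the examples below):

import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsSketch {
    public static void main(String[] args) {
        // Skip loading the default resources (core-default.xml, core-site.xml, ...).
        JobConf confWithoutDefaults = new JobConf(false);

        // For comparison: the no-arg constructor behaves like JobConf(true)
        // and does read the default resources.
        JobConf confWithDefaults = new JobConf();

        // Explicitly set properties are visible either way; values coming from
        // the default resources are only present when loadDefaults is true.
        confWithoutDefaults.set("mapreduce.job.name", "no-defaults-sketch");
        System.out.println(confWithoutDefaults.get("mapreduce.job.name"));
    }
}

Note that the usage examples below construct JobConf from an existing Configuration or from a job class rather than with the boolean flag.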

Usage

From source file:co.cask.cdap.explore.guice.ExploreRuntimeModule.java

License:Apache License

private static void setupClasspath(File tmpDir) throws IOException {
    // Here we find the transitive dependencies and remove all paths that come from the boot class path -
    // those paths are not needed because the new JVM will have them in its boot class path.
    // It could even be wrong to keep them because in the target container, the boot class path may be different
    // (for example, if Hadoop uses a different Java version than CDAP).

    final Set<String> bootstrapClassPaths = ExploreServiceUtils.getBoostrapClasses();

    ClassAcceptor classAcceptor = new ClassAcceptor() {
        /* Exclude any class contained in bootstrapClassPaths, the Kryo classes, and the Hive jars
         * (hive-exec.jar in particular). The Kryo dependency must be removed from the Explore
         * container: Spark introduced Kryo 2.21, which would normally be shipped to the Explore
         * container, yet Hive requires Kryo 2.22 and gets it from the Hive jars - hive-exec.jar
         * to be precise. The Hive jars are also excluded because the Hive dependencies are
         * already present in job.jar.
         */
        @Override
        public boolean accept(String className, URL classUrl, URL classPathUrl) {
            if (bootstrapClassPaths.contains(classPathUrl.getFile())
                    || className.startsWith("com.esotericsoftware.kryo")
                    || classPathUrl.getFile().contains("hive")) {
                return false;
            }
            return true;
        }
    };

    Set<File> hBaseTableDeps = ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
            HBaseTableUtilFactory.getHBaseTableUtilClass().getName());

    // Note the order of dependency jars is important so that HBase jars come first in the classpath order
    // LinkedHashSet maintains insertion order while removing duplicate entries.
    Set<File> orderedDependencies = new LinkedHashSet<>();
    orderedDependencies.addAll(hBaseTableDeps);
    orderedDependencies.addAll(ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
            RemoteDatasetFramework.class.getName(), DatasetStorageHandler.class.getName(),
            RecordFormats.class.getName()));

    // Note: the class path entries need to be prefixed with "file://" for the jars to work when
    // Hive starts local map-reduce job.
    ImmutableList.Builder<String> builder = ImmutableList.builder();
    for (File dep : orderedDependencies) {
        builder.add("file://" + dep.getAbsolutePath());
    }
    List<String> orderedDependenciesStr = builder.build();

    // These dependency files need to be copied over to spark container
    System.setProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES,
            Joiner.on(',').join(Iterables.transform(orderedDependencies, new Function<File, String>() {
                @Override
                public String apply(File input) {
                    return input.getAbsolutePath();
                }
            })));
    LOG.debug("Setting {} to {}", BaseHiveExploreService.SPARK_YARN_DIST_FILES,
            System.getProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES));

    // These dependency files need to be copied over to hive job container
    System.setProperty(HiveConf.ConfVars.HIVEAUXJARS.toString(), Joiner.on(',').join(orderedDependenciesStr));
    LOG.debug("Setting {} to {}", HiveConf.ConfVars.HIVEAUXJARS.toString(),
            System.getProperty(HiveConf.ConfVars.HIVEAUXJARS.toString()));

    // Add hive-exec.jar to the HADOOP_CLASSPATH used by the local MapReduce jobs launched by Hive.
    // Without it, when Hive runs a MapRedLocalTask it cannot find the
    // "org.apache.hadoop.hive.serde2.SerDe" class on its classpath.
    List<String> orderedDependenciesWithHiveJar = Lists.newArrayList(orderedDependenciesStr);
    String hiveExecJar = new JobConf(org.apache.hadoop.hive.ql.exec.Task.class).getJar();
    Preconditions.checkNotNull(hiveExecJar, "Couldn't locate hive-exec.jar to be included in HADOOP_CLASSPATH "
            + "for MapReduce jobs launched by Hive");
    orderedDependenciesWithHiveJar.add(hiveExecJar);
    LOG.debug("Added hive-exec.jar {} to HADOOP_CLASSPATH to be included for MapReduce jobs launched by Hive",
            hiveExecJar);

    // TODO: set up the HADOOP_CLASSPATH hack; for more info on why this is needed, see CDAP-9
    LocalMapreduceClasspathSetter classpathSetter = new LocalMapreduceClasspathSetter(new HiveConf(),
            tmpDir.getAbsolutePath(), orderedDependenciesWithHiveJar);
    for (File jar : hBaseTableDeps) {
        classpathSetter.accept(jar.getAbsolutePath());
    }
    classpathSetter.setupClasspathScript();
}

From source file:co.nubetech.hiho.job.DBQueryInputJob.java

License:Apache License

public void runJobs(Configuration conf, int jobCounter) throws IOException {

    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new IOException(e1);
    }

    Job job = new Job(conf);
    for (Entry<String, String> entry : conf) {
        logger.warn("key, value " + entry.getKey() + "=" + entry.getValue());
    }

    // logger.debug("Number of maps " +
    // conf.getInt("mapred.map.tasks", 1));
    // conf.setInt(JobContext.NUM_MAPS,
    // conf.getInt("mapreduce.job.maps", 1));
    // job.getConfiguration().setInt("mapred.map.tasks", 4);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1));

    job.setJobName("Import job");
    job.setJarByClass(DBQueryInputJob.class);

    String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
    OutputStrategyEnum os = OutputStrategyEnum.value(strategy);
    if (os == null) {
        throw new IllegalArgumentException("Wrong value of output strategy. Please correct");
    }
    if (os != OutputStrategyEnum.AVRO) {
        switch (os) {

        case DUMP: {
            // job.setMapperClass(DBImportMapper.class);
            break;
        }
        /*
         * case AVRO: { job.setMapperClass(DBInputAvroMapper.class); //
         * need avro in cp // job.setJarByClass(Schema.class); // need
         * jackson which is needed by avro - ugly! //
         * job.setJarByClass(ObjectMapper.class);
         * job.setMapOutputKeyClass(NullWritable.class);
         * job.setMapOutputValueClass(AvroValue.class);
         * job.setOutputKeyClass(NullWritable.class);
         * job.setOutputValueClass(AvroValue.class);
         * job.setOutputFormatClass(AvroOutputFormat.class);
         * 
         * AvroOutputFormat.setOutputPath(job, new
         * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); break; }
         */
        case DELIMITED: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        case JSON: {
            // job.setMapperClass(DBImportJsonMapper.class);
            // job.setJarByClass(ObjectMapper.class);
            break;
        }
        default: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        }

        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        job.setNumReduceTasks(0);

        try {
            // job.setJarByClass(Class.forName(conf.get(
            // org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY)));
            logger.debug("OUTPUT format class is " + job.getOutputFormatClass());

            /*
             * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
             * ReflectionUtils.newInstance(job.getOutputFormatClass(),
             * job.getConfiguration()); output.checkOutputSpecs(job);
             */
            logger.debug("Class is " + ReflectionUtils
                    .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
            job.waitForCompletion(false);
            if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) {
                generateHiveScript(conf, job, jobCounter);
                generatePigScript(conf, job);
            }

        }
        /*
         * catch (HIHOException h) { h.printStackTrace(); }
         */
        catch (Exception e) {
            // HIHOException is caught here as well; a separate catch for it after
            // catch (Exception e) would be unreachable and would not compile.
            e.printStackTrace();
        }
    }
    // Avro has to be handled differently, thanks to all the incompatibilities
    // in the APIs.
    else {
        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job,
        // inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        JobConf jobConf = new JobConf(conf);

        try {
            GenericDBWritable queryWritable = getDBWritable(jobConf);
            Schema pair = DBMapper.getPairSchema(queryWritable.getColumns());

            AvroJob.setMapOutputSchema(jobConf, pair);
            GenericRecordAvroOutputFormat.setOutputPath(jobConf,
                    new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));

            co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery,
                    inputBoundingQuery, params);
            jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class);
            jobConf.setMapperClass(DBInputAvroMapper.class);
            jobConf.setMapOutputKeyClass(NullWritable.class);
            jobConf.setMapOutputValueClass(AvroValue.class);
            jobConf.setOutputKeyClass(NullWritable.class);
            jobConf.setOutputValueClass(Text.class);
            jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class);
            jobConf.setJarByClass(DBQueryInputJob.class);
            jobConf.setStrings("io.serializations",
                    "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization,org.apache.avro.mapred.AvroSerialization");
            jobConf.setNumReduceTasks(0);
            /*
             * jobConf.setOutputFormat(org.apache.hadoop.mapred.
             * SequenceFileOutputFormat.class);
             * org.apache.hadoop.mapred.SequenceFileOutputFormat
             * .setOutputPath(jobConf, new
             * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
             */
            JobClient.runJob(jobConf);
        } catch (Throwable e) {
            e.printStackTrace();
        }

    }

}

From source file:co.nubetech.hiho.mapreduce.lib.input.TestFileStreamInputFormat.java

License:Apache License

@Test
public void testNumInputs() throws Exception {
    Configuration conf = new Configuration();
    JobConf job = new JobConf(conf);
    MiniDFSCluster dfs = newDFSCluster(job);
    FileSystem fs = dfs.getFileSystem();
    System.out.println("FileSystem " + fs.getUri());
    Path inputDir = new Path("/foo/");
    final int numFiles = 10;
    String fileNameBase = "part-0000";

}

From source file:colossal.pipe.ColPhase.java

License:Apache License

public List<PhaseError> plan(ColPipe distPipeline) {
    List<PhaseError> errors = new ArrayList<PhaseError>();
    conf = new JobConf(distPipeline.getConf());
    for (Map.Entry<String, String> entry : props.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    Schema mapin = null;
    Class<?> mapOutClass = null;
    Class<?> mapInClass = null;

    Class<? extends ColMapper> mapperClass = null;
    if (mappers != null && mappers.length > 0) {
        if (mappers.length > 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one mapper per process: " + name));
        } else {
            mapperClass = mappers[0];
            conf.set(MAPPER, mapperClass.getName());
            Class<?> foundIn = null;
            for (Method m : mapperClass.getMethods()) {
                if ("map".equals(m.getName())) {
                    Class<?>[] paramTypes = m.getParameterTypes();
                    if (paramTypes.length >= 3) {
                        try {
                            // prefer subclass methods to superclass methods
                            if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                                if (paramTypes[0] == Object.class) {
                                    if (foundIn == m.getDeclaringClass()) {
                                        // skip the generated "override" of the generic method
                                        continue;
                                    }
                                } else {
                                    //TODO: handle cases beyond Object where output isn't defined    
                                    mapInClass = paramTypes[0];
                                    mapin = getSchema(paramTypes[0].newInstance());
                                }
                                mapOutClass = paramTypes[1];
                                foundIn = m.getDeclaringClass();
                            }
                        } catch (Exception e) {
                            errors.add(new PhaseError(e, "Can't create mapper: " + mapperClass));
                        }
                    }
                }
            }
        }
    }

    if (combiners != null && combiners.length > 0) {
        if (combiners.length > 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one combiner per process: " + name));
        } else {
            conf.set(COMBINER, combiners[0].getName());
            conf.setCombinerClass(ColHadoopCombiner.class);
        }
    }
    Schema reduceout = null;
    Class<?> reduceOutClass = null;
    Class<? extends ColReducer> reducerClass = null;
    if (reducers != null && reducers.length > 0) {
        if (reducers.length != 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one reducer per process: " + name));
        } else {
            reducerClass = reducers[0];
            conf.set(REDUCER, reducers[0].getName());
            Class<?> foundIn = null;
            for (Method m : reducerClass.getMethods()) {
                if ("reduce".equals(m.getName())) {
                    Class<?>[] paramTypes = m.getParameterTypes();
                    if (paramTypes.length >= 3) {
                        if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                            if (foundIn == m.getDeclaringClass() && paramTypes[1] == Object.class) {
                                // skip the generated "override" of the generic method
                                continue;
                            }
                            // prefer subclass methods to superclass methods
                            reduceOutClass = paramTypes[1];
                            foundIn = m.getDeclaringClass();
                        }
                    }
                }
            }
            // XXX validation!
        }
    }
    Object reduceOutProto = null;
    //TODO: handle cases beyond Object where output isn't defined
    if ((reduceOutClass == null || reduceOutClass == Object.class) && mainWrites != null
            && mainWrites.size() > 0) {
        reduceOutProto = mainWrites.get(0).getPrototype();
        reduceOutClass = reduceOutProto.getClass();
    } else {
        try {
            reduceOutProto = reduceOutClass.newInstance();
        } catch (Exception e) {
            errors.add(new PhaseError(e, "Can't create reducer output class: " + reduceOutClass));
        }
    }
    if (reduceOutProto != null)
        reduceout = getSchema(reduceOutProto);

    conf.set(REDUCE_OUT_CLASS, reduceOutClass.getName());

    Schema valueSchema = null;
    if (mainWrites.size() != 1) {
        errors.add(
                new PhaseError("Colossal phase/avro currently only supports one output per process: " + name));
    } else {
        ColFile output = mainWrites.get(0);
        AvroOutputFormat.setOutputPath(conf, new Path(output.getPath()));

        if (output.getPrototype() != null) {
            valueSchema = getSchema(output.getPrototype());
            if (reduceout != null) {
                assert reduceout.equals(valueSchema); // should make an error not assert this!
            }
        } else {
            if (reduceout == null) {
                errors.add(new PhaseError("No output format defined"));
            }
            valueSchema = reduceout;
        }
        output.setupOutput(conf);
    }
    conf.set(AvroJob.OUTPUT_SCHEMA, valueSchema.toString());

    if (deflateLevel != null)
        AvroOutputFormat.setDeflateLevel(conf, deflateLevel);

    Object proto = null;
    if (mainReads != null && mainReads.size() > 0) {
        Path[] inPaths = new Path[mainReads.size()];
        int i = 0;
        for (ColFile file : mainReads) {
            inPaths[i++] = new Path(file.getPath());
            Object myProto = file.getPrototype();
            if (myProto == null) {
                errors.add(new PhaseError("Files need non-null prototypes " + file));
            } else if (proto != null) {
                if (myProto.getClass() != proto.getClass()) {
                    errors.add(new PhaseError("Inconsistent prototype classes for inputs: " + myProto.getClass()
                            + " vs " + proto.getClass() + " for " + file));
                }
            } else {
                proto = myProto;
            }
        }
        AvroInputFormat.setInputPaths(conf, inPaths);

        if (mapin == null) {
            if (proto == null) {
                errors.add(new PhaseError("Undefined input format"));
            } else {
                mapin = getSchema(proto);
                mapInClass = proto.getClass();
            }
        }
        mainReads.get(0).setupInput(conf);
        if (conf.get("mapred.input.format.class") == null)
            conf.setInputFormat(AvroInputFormat.class);
    }

    Schema mapValueSchema = null;
    try {
        //TODO: handle cases beyond Object where input isn't defined
        if (mapOutClass == null || mapOutClass == Object.class) {
            assert mapperClass == null;
            if (proto != null) {
                mapOutClass = proto.getClass();
                mapValueSchema = getSchema(proto);
            } else {
                // not available - try to get it from the reducer
                if (reducerClass == null) {
                    mapOutClass = reduceOutClass;
                    mapValueSchema = getSchema(reduceOutClass.newInstance());
                } else {
                    // can't get it from reducer input - that's just Iterable
                    String fname = "no input file specified";
                    if (mainReads != null && mainReads.size() > 0)
                        fname = mainReads.get(0).getPath();
                    errors.add(new PhaseError(
                            "No input format specified for identity mapper - specify it on input file "
                                    + fname));
                }
            }
        } else {
            mapValueSchema = getSchema(mapOutClass.newInstance());
        }
        if (mapValueSchema != null)
            conf.set(MAP_OUT_VALUE_SCHEMA, mapValueSchema.toString());
    } catch (Exception e) {
        errors.add(new PhaseError(e, "Can't create instance of map output class: " + mapOutClass));
    }

    conf.set(MAP_OUT_CLASS, mapOutClass.getName());
    conf.set(MAP_IN_CLASS, mapInClass.getName());
    // XXX validation!
    if (proto != null) {
        conf.set(AvroJob.INPUT_SCHEMA, getSchema(proto).toString());
    } else if (mapin != null) {
        conf.set(AvroJob.INPUT_SCHEMA, mapin.toString());
    } else {
        errors.add(new PhaseError("No map input defined"));
    }

    if (groupBy != null || sortBy != null) {
        conf.set(MAP_OUT_KEY_SCHEMA, group(mapValueSchema, groupBy, sortBy).toString());
    }
    if (groupBy != null) {
        conf.set(GROUP_BY, groupBy);
        AvroJob.setOutputMeta(conf, GROUP_BY, groupBy);
    }
    if (sortBy != null) {
        conf.setPartitionerClass(AvroGroupPartitioner.class);
        conf.set(SORT_BY, sortBy);
        AvroJob.setOutputMeta(conf, SORT_BY, sortBy);
    }

    conf.setMapOutputKeyClass(AvroKey.class);
    conf.setMapOutputValueClass(AvroValue.class);
    conf.setOutputKeyComparatorClass(ColKeyComparator.class);

    conf.setMapperClass(ColHadoopMapper.class);
    conf.setReducerClass(ColHadoopReducer.class);

    for (Map.Entry<String, String> entry : textMeta.entrySet())
        AvroJob.setOutputMeta(conf, entry.getKey(), entry.getValue());

    // add ColAvroSerialization to io.serializations
    Collection<String> serializations = conf.getStringCollection("io.serializations");
    if (!serializations.contains(ColAvroSerialization.class.getName())) {
        serializations.add(ColAvroSerialization.class.getName());
        conf.setStrings("io.serializations", serializations.toArray(new String[0]));
    }
    return errors;
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java

License:Apache License

/**
 * The driver for the MapReduce job.
 *
 * @param conf           configuration
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws java.io.IOException         if something went wrong
 * @throws java.net.URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString)
        throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

    JobConf job = new JobConf(conf);

    job.setJarByClass(CombineSequenceFileJob.class);
    job.setJobName("seqfilecombiner");

    job.setNumReduceTasks(0);

    job.setMapperClass(IdentityMapper.class);

    job.setInputFormat(CombineSequenceFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputDirAsString);
    FileOutputFormat.setOutputPath(job, new Path(outputDirAsString));

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    RunningJob jobResult = JobClient.runJob(job);

    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    return jobResult.isSuccessful();
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileTest.java

License:Apache License

@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);

    FileInputFormat.addInputPath(jobConf, inputFile);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    CommonCombineRecordReader<Text, Text> rr = (CommonCombineRecordReader<Text, Text>) inputFormat
            .getRecordReader(splits[0], jobConf, new DummyReporter());
    Text k = new Text();
    Text v = new Text();
    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertFalse(rr.next(k, v));
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileTest.java

License:Apache License

@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);

    FileInputFormat.addInputPath(jobConf, inputFile1);
    FileInputFormat.addInputPath(jobConf, inputFile2);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    CommonCombineRecordReader<Text, Text> rr = (CommonCombineRecordReader<Text, Text>) inputFormat
            .getRecordReader(splits[0], jobConf, new DummyReporter());
    Text k = new Text();
    Text v = new Text();

    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertFalse(rr.next(k, v));
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file:com.alexholmes.hadooputils.sort.Sort.java

License:Apache License

/**
 * The driver for sort program which works with command-line arguments.
 *
 * @param args command-line arguments
 * @return 0 if everything went well, non-zero for everything else
 * @throws Exception when there are communication problems with the
 *                   job tracker
 */
@SuppressWarnings("unchecked")
public int run(final String[] args) throws Exception {

    SortConfig sortConfig = new SortConfig(getConf());

    Integer numMapTasks = null;
    Integer numReduceTasks = null;

    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    Class<? extends CompressionCodec> codecClass = null;
    Class<? extends CompressionCodec> mapCodecClass = null;
    boolean createLzopIndex = false;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                numMapTasks = Integer.parseInt(args[++i]);
            } else if ("-r".equals(args[i])) {
                numReduceTasks = Integer.parseInt(args[++i]);
            } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
                sortConfig.setIgnoreCase(true);
            } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
                sortConfig.setUnique(true);
            } else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
                String[] parts = StringUtils.split(args[++i], ",");
                sortConfig.setStartKey(Integer.valueOf(parts[0]));
                if (parts.length > 1) {
                    sortConfig.setEndKey(Integer.valueOf(parts[1]));
                }
            } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
                sortConfig.setFieldSeparator(args[++i]);
            } else if ("--total-order".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("--map-codec".equals(args[i])) {
                mapCodecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
            } else if ("--codec".equals(args[i])) {
                codecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
            } else if ("--lzop-index".equals(args[i])) {
                createLzopIndex = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }

    if (runJob(new JobConf(sortConfig.getConfig()), numMapTasks, numReduceTasks, sampler, codecClass,
            mapCodecClass, createLzopIndex, otherArgs.get(0), otherArgs.get(1))) {
        return 0;
    }
    return 1;
}

From source file:com.alexholmes.hadooputils.sort.SortTest.java

License:Apache License

public void run(TextIOJobBuilder builder) throws Exception {
    run(new JobConf(new SortConfig(builder.getFs().getConf()).getConfig()), builder, 1, 1, null);
}

From source file:com.alexholmes.hadooputils.sort.SortTest.java

License:Apache License

public void run(SortConfig sortConfig, TextIOJobBuilder builder) throws Exception {
    run(new JobConf(sortConfig.getConfig()), builder, 1, 1, null);
}