Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usages of the JobConf constructor from org.apache.hadoop.mapred.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
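
A minimal sketch of the prototype above, assuming nothing beyond the constructor itself (the class name and the property used here are illustrative, not taken from the examples below):

import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsSketch {
    public static void main(String[] args) {
        // Skip loading the default resources (core-default.xml, core-site.xml, ...).
        JobConf confWithoutDefaults = new JobConf(false);

        // For comparison: the no-arg constructor behaves like JobConf(true)
        // and does read the default resources.
        JobConf confWithDefaults = new JobConf();

        // Explicitly set properties are visible either way; values coming from
        // the default resources are only present when loadDefaults is true.
        confWithoutDefaults.set("mapreduce.job.name", "no-defaults-sketch");
        System.out.println(confWithoutDefaults.get("mapreduce.job.name"));
    }
}

Note that the usage examples below construct JobConf from an existing Configuration or from a job class rather than with the boolean flag.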

Usage

From source file:co.cask.cdap.explore.guice.ExploreRuntimeModule.java

License:Apache License

private static void setupClasspath(File tmpDir) throws IOException {
    // Here we find the transitive dependencies and remove all paths that come from the boot class path -
    // those paths are not needed because the new JVM will have them in its boot class path.
    // It could even be wrong to keep them because in the target container, the boot class path may be different
    // (for example, if Hadoop uses a different Java version than CDAP).

    final Set<String> bootstrapClassPaths = ExploreServiceUtils.getBoostrapClasses();

    ClassAcceptor classAcceptor = new ClassAcceptor() {
        /* Exclude any class contained in bootstrapClassPaths, the Kryo classes, and the Hive jars
         * (hive-exec.jar in particular). The Kryo dependency must be removed from the Explore
         * container: Spark introduced Kryo 2.21, which would normally be shipped to the Explore
         * container, yet Hive requires Kryo 2.22 and gets it from the Hive jars - hive-exec.jar
         * to be precise. The Hive jars are also excluded because the Hive dependencies are
         * already present in job.jar.
         */
        @Override
        public boolean accept(String className, URL classUrl, URL classPathUrl) {
            if (bootstrapClassPaths.contains(classPathUrl.getFile())
                    || className.startsWith("com.esotericsoftware.kryo")
                    || classPathUrl.getFile().contains("hive")) {
                return false;
            }
            return true;
        }
    };

    Set<File> hBaseTableDeps = ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
            HBaseTableUtilFactory.getHBaseTableUtilClass().getName());

    // Note the order of dependency jars is important so that HBase jars come first in the classpath order
    // LinkedHashSet maintains insertion order while removing duplicate entries.
    Set<File> orderedDependencies = new LinkedHashSet<>();
    orderedDependencies.addAll(hBaseTableDeps);
    orderedDependencies.addAll(ExploreServiceUtils.traceDependencies(null, classAcceptor, tmpDir,
            RemoteDatasetFramework.class.getName(), DatasetStorageHandler.class.getName(),
            RecordFormats.class.getName()));

    // Note: the class path entries need to be prefixed with "file://" for the jars to work when
    // Hive starts local map-reduce job.
    ImmutableList.Builder<String> builder = ImmutableList.builder();
    for (File dep : orderedDependencies) {
        builder.add("file://" + dep.getAbsolutePath());
    }
    List<String> orderedDependenciesStr = builder.build();

    // These dependency files need to be copied over to spark container
    System.setProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES,
            Joiner.on(',').join(Iterables.transform(orderedDependencies, new Function<File, String>() {
                @Override
                public String apply(File input) {
                    return input.getAbsolutePath();
                }
            })));
    LOG.debug("Setting {} to {}", BaseHiveExploreService.SPARK_YARN_DIST_FILES,
            System.getProperty(BaseHiveExploreService.SPARK_YARN_DIST_FILES));

    // These dependency files need to be copied over to hive job container
    System.setProperty(HiveConf.ConfVars.HIVEAUXJARS.toString(), Joiner.on(',').join(orderedDependenciesStr));
    LOG.debug("Setting {} to {}", HiveConf.ConfVars.HIVEAUXJARS.toString(),
            System.getProperty(HiveConf.ConfVars.HIVEAUXJARS.toString()));

    // Add hive-exec.jar to the HADOOP_CLASSPATH used by the local MapReduce jobs launched by Hive.
    // Without it, when Hive runs a MapRedLocalTask it cannot find the
    // "org.apache.hadoop.hive.serde2.SerDe" class on its classpath.
    List<String> orderedDependenciesWithHiveJar = Lists.newArrayList(orderedDependenciesStr);
    String hiveExecJar = new JobConf(org.apache.hadoop.hive.ql.exec.Task.class).getJar();
    Preconditions.checkNotNull(hiveExecJar, "Couldn't locate hive-exec.jar to be included in HADOOP_CLASSPATH "
            + "for MapReduce jobs launched by Hive");
    orderedDependenciesWithHiveJar.add(hiveExecJar);
    LOG.debug("Added hive-exec.jar {} to HADOOP_CLASSPATH to be included for MapReduce jobs launched by Hive",
            hiveExecJar);

    // TODO: set up the HADOOP_CLASSPATH hack; for more info on why this is needed, see CDAP-9
    LocalMapreduceClasspathSetter classpathSetter = new LocalMapreduceClasspathSetter(new HiveConf(),
            tmpDir.getAbsolutePath(), orderedDependenciesWithHiveJar);
    for (File jar : hBaseTableDeps) {
        classpathSetter.accept(jar.getAbsolutePath());
    }
    classpathSetter.setupClasspathScript();
}

From source file:co.nubetech.hiho.job.DBQueryInputJob.java

License:Apache License

public void runJobs(Configuration conf, int jobCounter) throws IOException {

    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new IOException(e1);
    }

    Job job = new Job(conf);
    for (Entry<String, String> entry : conf) {
        logger.warn("key, value " + entry.getKey() + "=" + entry.getValue());
    }

    // logger.debug("Number of maps " +
    // conf.getInt("mapred.map.tasks", 1));
    // conf.setInt(JobContext.NUM_MAPS,
    // conf.getInt("mapreduce.job.maps", 1));
    // job.getConfiguration().setInt("mapred.map.tasks", 4);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    logger.warn("Number of maps " + conf.getInt(MRJobConfig.NUM_MAPS, 1));

    job.setJobName("Import job");
    job.setJarByClass(DBQueryInputJob.class);

    String strategy = conf.get(HIHOConf.INPUT_OUTPUT_STRATEGY);
    OutputStrategyEnum os = OutputStrategyEnum.value(strategy);
    if (os == null) {
        throw new IllegalArgumentException("Wrong value of output strategy. Please correct");
    }
    if (os != OutputStrategyEnum.AVRO) {
        switch (os) {

        case DUMP: {
            // job.setMapperClass(DBImportMapper.class);
            break;
        }
        /*
         * case AVRO: { job.setMapperClass(DBInputAvroMapper.class); //
         * need avro in cp // job.setJarByClass(Schema.class); // need
         * jackson which is needed by avro - ugly! //
         * job.setJarByClass(ObjectMapper.class);
         * job.setMapOutputKeyClass(NullWritable.class);
         * job.setMapOutputValueClass(AvroValue.class);
         * job.setOutputKeyClass(NullWritable.class);
         * job.setOutputValueClass(AvroValue.class);
         * job.setOutputFormatClass(AvroOutputFormat.class);
         * 
         * AvroOutputFormat.setOutputPath(job, new
         * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH))); break; }
         */
        case DELIMITED: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        case JSON: {
            // job.setMapperClass(DBImportJsonMapper.class);
            // job.setJarByClass(ObjectMapper.class);
            break;
        }
        default: {
            job.setMapperClass(DBInputDelimMapper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(Text.class);
            job.setOutputFormatClass(NoKeyOnlyValueOutputFormat.class);

            NoKeyOnlyValueOutputFormat.setOutputPath(job, new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
            break;
        }
        }

        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        DBQueryInputFormat.setInput(job, inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        job.setNumReduceTasks(0);

        try {
            // job.setJarByClass(Class.forName(conf.get(
            // org.apache.hadoop.mapred.lib.db.DBConfiguration.DRIVER_CLASS_PROPERTY)));
            logger.debug("OUTPUT format class is " + job.getOutputFormatClass());

            /*
             * org.apache.hadoop.mapreduce.OutputFormat<?, ?> output =
             * ReflectionUtils.newInstance(job.getOutputFormatClass(),
             * job.getConfiguration()); output.checkOutputSpecs(job);
             */
            logger.debug("Class is " + ReflectionUtils
                    .newInstance(job.getOutputFormatClass(), job.getConfiguration()).getClass().getName());
            job.waitForCompletion(false);
            if (conf.get(HIHOConf.INPUT_OUTPUT_LOADTO) != null) {
                generateHiveScript(conf, job, jobCounter);
                generatePigScript(conf, job);
            }

        }
        /*
         * catch (HIHOException h) { h.printStackTrace(); }
         */
        catch (Exception e) {
            // HIHOException is caught here as well; a separate catch for it after
            // catch (Exception e) would be unreachable and would not compile.
            e.printStackTrace();
        }
    }
    // Avro has to be handled differently, thanks to all the incompatibilities
    // in the APIs.
    else {
        String inputQuery = conf.get(DBConfiguration.INPUT_QUERY);
        String inputBoundingQuery = conf.get(DBConfiguration.INPUT_BOUNDING_QUERY);
        logger.debug("About to set the params");
        // co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(job,
        // inputQuery, inputBoundingQuery, params);
        logger.debug("Set the params");

        JobConf jobConf = new JobConf(conf);

        try {
            GenericDBWritable queryWritable = getDBWritable(jobConf);
            Schema pair = DBMapper.getPairSchema(queryWritable.getColumns());

            AvroJob.setMapOutputSchema(jobConf, pair);
            GenericRecordAvroOutputFormat.setOutputPath(jobConf,
                    new Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));

            co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.setInput(jobConf, inputQuery,
                    inputBoundingQuery, params);
            jobConf.setInputFormat(co.nubetech.apache.hadoop.mapred.DBQueryInputFormat.class);
            jobConf.setMapperClass(DBInputAvroMapper.class);
            jobConf.setMapOutputKeyClass(NullWritable.class);
            jobConf.setMapOutputValueClass(AvroValue.class);
            jobConf.setOutputKeyClass(NullWritable.class);
            jobConf.setOutputValueClass(Text.class);
            jobConf.setOutputFormat(GenericRecordAvroOutputFormat.class);
            jobConf.setJarByClass(DBQueryInputJob.class);
            jobConf.setStrings("io.serializations",
                    "org.apache.hadoop.io.serializer.JavaSerialization,org.apache.hadoop.io.serializer.WritableSerialization,org.apache.avro.mapred.AvroSerialization");
            jobConf.setNumReduceTasks(0);
            /*
             * jobConf.setOutputFormat(org.apache.hadoop.mapred.
             * SequenceFileOutputFormat.class);
             * org.apache.hadoop.mapred.SequenceFileOutputFormat
             * .setOutputPath(jobConf, new
             * Path(getConf().get(HIHOConf.INPUT_OUTPUT_PATH)));
             */
            JobClient.runJob(jobConf);
        } catch (Throwable e) {
            e.printStackTrace();
        }

    }

}

From source file:co.nubetech.hiho.mapreduce.lib.input.TestFileStreamInputFormat.java

License:Apache License

@Test
public void testNumInputs() throws Exception {
    Configuration conf = new Configuration();
    JobConf job = new JobConf(conf);
    MiniDFSCluster dfs = newDFSCluster(job);
    FileSystem fs = dfs.getFileSystem();
    System.out.println("FileSystem " + fs.getUri());
    Path inputDir = new Path("/foo/");
    final int numFiles = 10;
    String fileNameBase = "part-0000";

}

From source file:colossal.pipe.ColPhase.java

License:Apache License

public List<PhaseError> plan(ColPipe distPipeline) {
    List<PhaseError> errors = new ArrayList<PhaseError>();
    conf = new JobConf(distPipeline.getConf());
    for (Map.Entry<String, String> entry : props.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    Schema mapin = null;
    Class<?> mapOutClass = null;
    Class<?> mapInClass = null;

    Class<? extends ColMapper> mapperClass = null;
    if (mappers != null && mappers.length > 0) {
        if (mappers.length > 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one mapper per process: " + name));
        } else {
            mapperClass = mappers[0];
            conf.set(MAPPER, mapperClass.getName());
            Class<?> foundIn = null;
            for (Method m : mapperClass.getMethods()) {
                if ("map".equals(m.getName())) {
                    Class<?>[] paramTypes = m.getParameterTypes();
                    if (paramTypes.length >= 3) {
                        try {
                            // prefer subclass methods to superclass methods
                            if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                                if (paramTypes[0] == Object.class) {
                                    if (foundIn == m.getDeclaringClass()) {
                                        // skip the generated "override" of the generic method
                                        continue;
                                    }
                                } else {
                                    //TODO: handle cases beyond Object where output isn't defined    
                                    mapInClass = paramTypes[0];
                                    mapin = getSchema(paramTypes[0].newInstance());
                                }
                                mapOutClass = paramTypes[1];
                                foundIn = m.getDeclaringClass();
                            }
                        } catch (Exception e) {
                            errors.add(new PhaseError(e, "Can't create mapper: " + mapperClass));
                        }
                    }
                }
            }
        }
    }

    if (combiners != null && combiners.length > 0) {
        if (combiners.length > 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one combiner per process: " + name));
        } else {
            conf.set(COMBINER, combiners[0].getName());
            conf.setCombinerClass(ColHadoopCombiner.class);
        }
    }
    Schema reduceout = null;
    Class<?> reduceOutClass = null;
    Class<? extends ColReducer> reducerClass = null;
    if (reducers != null && reducers.length > 0) {
        if (reducers.length != 1) {
            errors.add(new PhaseError(
                    "Colossal phase/avro currently only supports one reducer per process: " + name));
        } else {
            reducerClass = reducers[0];
            conf.set(REDUCER, reducers[0].getName());
            Class<?> foundIn = null;
            for (Method m : reducerClass.getMethods()) {
                if ("reduce".equals(m.getName())) {
                    Class<?>[] paramTypes = m.getParameterTypes();
                    if (paramTypes.length >= 3) {
                        if (foundIn == null || foundIn.isAssignableFrom(m.getDeclaringClass())) {
                            if (foundIn == m.getDeclaringClass() && paramTypes[1] == Object.class) {
                                // skip the generated "override" of the generic method
                                continue;
                            }
                            // prefer subclass methods to superclass methods
                            reduceOutClass = paramTypes[1];
                            foundIn = m.getDeclaringClass();
                        }
                    }
                }
            }
            // XXX validation!
        }
    }
    Object reduceOutProto = null;
    //TODO: handle cases beyond Object where output isn't defined
    if ((reduceOutClass == null || reduceOutClass == Object.class) && mainWrites != null
            && mainWrites.size() > 0) {
        reduceOutProto = mainWrites.get(0).getPrototype();
        reduceOutClass = reduceOutProto.getClass();
    } else {
        try {
            reduceOutProto = reduceOutClass.newInstance();
        } catch (Exception e) {
            errors.add(new PhaseError(e, "Can't create reducer output class: " + reduceOutClass));
        }
    }
    if (reduceOutProto != null)
        reduceout = getSchema(reduceOutProto);

    conf.set(REDUCE_OUT_CLASS, reduceOutClass.getName());

    Schema valueSchema = null;
    if (mainWrites.size() != 1) {
        errors.add(
                new PhaseError("Colossal phase/avro currently only supports one output per process: " + name));
    } else {
        ColFile output = mainWrites.get(0);
        AvroOutputFormat.setOutputPath(conf, new Path(output.getPath()));

        if (output.getPrototype() != null) {
            valueSchema = getSchema(output.getPrototype());
            if (reduceout != null) {
                assert reduceout.equals(valueSchema); // should make an error not assert this!
            }
        } else {
            if (reduceout == null) {
                errors.add(new PhaseError("No output format defined"));
            }
            valueSchema = reduceout;
        }
        output.setupOutput(conf);
    }
    conf.set(AvroJob.OUTPUT_SCHEMA, valueSchema.toString());

    if (deflateLevel != null)
        AvroOutputFormat.setDeflateLevel(conf, deflateLevel);

    Object proto = null;
    if (mainReads != null && mainReads.size() > 0) {
        Path[] inPaths = new Path[mainReads.size()];
        int i = 0;
        for (ColFile file : mainReads) {
            inPaths[i++] = new Path(file.getPath());
            Object myProto = file.getPrototype();
            if (myProto == null) {
                errors.add(new PhaseError("Files need non-null prototypes " + file));
            } else if (proto != null) {
                if (myProto.getClass() != proto.getClass()) {
                    errors.add(new PhaseError("Inconsistent prototype classes for inputs: " + myProto.getClass()
                            + " vs " + proto.getClass() + " for " + file));
                }
            } else {
                proto = myProto;
            }
        }
        AvroInputFormat.setInputPaths(conf, inPaths);

        if (mapin == null) {
            if (proto == null) {
                errors.add(new PhaseError("Undefined input format"));
            } else {
                mapin = getSchema(proto);
                mapInClass = proto.getClass();
            }
        }
        mainReads.get(0).setupInput(conf);
        if (conf.get("mapred.input.format.class") == null)
            conf.setInputFormat(AvroInputFormat.class);
    }

    Schema mapValueSchema = null;
    try {
        //TODO: handle cases beyond Object where input isn't defined
        if (mapOutClass == null || mapOutClass == Object.class) {
            assert mapperClass == null;
            if (proto != null) {
                mapOutClass = proto.getClass();
                mapValueSchema = getSchema(proto);
            } else {
                // not available - try to get it from the reducer
                if (reducerClass == null) {
                    mapOutClass = reduceOutClass;
                    mapValueSchema = getSchema(reduceOutClass.newInstance());
                } else {
                    // can't get it from reducer input - that's just Iterable
                    String fname = "no input file specified";
                    if (mainReads != null && mainReads.size() > 0)
                        fname = mainReads.get(0).getPath();
                    errors.add(new PhaseError(
                            "No input format specified for identity mapper - specify it on input file "
                                    + fname));
                }
            }
        } else {
            mapValueSchema = getSchema(mapOutClass.newInstance());
        }
        if (mapValueSchema != null)
            conf.set(MAP_OUT_VALUE_SCHEMA, mapValueSchema.toString());
    } catch (Exception e) {
        errors.add(new PhaseError(e, "Can't create instance of map output class: " + mapOutClass));
    }

    conf.set(MAP_OUT_CLASS, mapOutClass.getName());
    conf.set(MAP_IN_CLASS, mapInClass.getName());
    // XXX validation!
    if (proto != null) {
        conf.set(AvroJob.INPUT_SCHEMA, getSchema(proto).toString());
    } else if (mapin != null) {
        conf.set(AvroJob.INPUT_SCHEMA, mapin.toString());
    } else {
        errors.add(new PhaseError("No map input defined"));
    }

    if (groupBy != null || sortBy != null) {
        conf.set(MAP_OUT_KEY_SCHEMA, group(mapValueSchema, groupBy, sortBy).toString());
    }
    if (groupBy != null) {
        conf.set(GROUP_BY, groupBy);
        AvroJob.setOutputMeta(conf, GROUP_BY, groupBy);
    }
    if (sortBy != null) {
        conf.setPartitionerClass(AvroGroupPartitioner.class);
        conf.set(SORT_BY, sortBy);
        AvroJob.setOutputMeta(conf, SORT_BY, sortBy);
    }

    conf.setMapOutputKeyClass(AvroKey.class);
    conf.setMapOutputValueClass(AvroValue.class);
    conf.setOutputKeyComparatorClass(ColKeyComparator.class);

    conf.setMapperClass(ColHadoopMapper.class);
    conf.setReducerClass(ColHadoopReducer.class);

    for (Map.Entry<String, String> entry : textMeta.entrySet())
        AvroJob.setOutputMeta(conf, entry.getKey(), entry.getValue());

    // add ColAvroSerialization to io.serializations
    Collection<String> serializations = conf.getStringCollection("io.serializations");
    if (!serializations.contains(ColAvroSerialization.class.getName())) {
        serializations.add(ColAvroSerialization.class.getName());
        conf.setStrings("io.serializations", serializations.toArray(new String[0]));
    }
    return errors;
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileJob.java

License:Apache License

/**
 * The driver for the MapReduce job.
 *
 * @param conf           configuration
 * @param inputDirAsString  input directory in CSV-form
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws java.io.IOException         if something went wrong
 * @throws java.net.URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString)
        throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

    JobConf job = new JobConf(conf);

    job.setJarByClass(CombineSequenceFileJob.class);
    job.setJobName("seqfilecombiner");

    job.setNumReduceTasks(0);

    job.setMapperClass(IdentityMapper.class);

    job.setInputFormat(CombineSequenceFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputDirAsString);
    FileOutputFormat.setOutputPath(job, new Path(outputDirAsString));

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    RunningJob jobResult = JobClient.runJob(job);

    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    return jobResult.isSuccessful();
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileTest.java

License:Apache License

@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);

    FileInputFormat.addInputPath(jobConf, inputFile);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    CommonCombineRecordReader<Text, Text> rr = (CommonCombineRecordReader<Text, Text>) inputFormat
            .getRecordReader(splits[0], jobConf, new DummyReporter());
    Text k = new Text();
    Text v = new Text();
    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertFalse(rr.next(k, v));
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapred.CombineSequenceFileTest.java

License:Apache License

@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);

    FileInputFormat.addInputPath(jobConf, inputFile1);
    FileInputFormat.addInputPath(jobConf, inputFile2);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    assertEquals(1, splits.length);

    CommonCombineRecordReader<Text, Text> rr = (CommonCombineRecordReader<Text, Text>) inputFormat
            .getRecordReader(splits[0], jobConf, new DummyReporter());
    Text k = new Text();
    Text v = new Text();

    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.next(k, v));

    assertEquals(key, k);
    assertEquals(value, v);

    assertFalse(rr.next(k, v));
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file:com.alexholmes.hadooputils.sort.Sort.java

License:Apache License

/**
 * The driver for sort program which works with command-line arguments.
 *
 * @param args command-line arguments
 * @return 0 if everything went well, non-zero for everything else
 * @throws Exception when there are communication problems with the
 *                   job tracker
 */
@SuppressWarnings("unchecked")
public int run(final String[] args) throws Exception {

    SortConfig sortConfig = new SortConfig(getConf());

    Integer numMapTasks = null;
    Integer numReduceTasks = null;

    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    Class<? extends CompressionCodec> codecClass = null;
    Class<? extends CompressionCodec> mapCodecClass = null;
    boolean createLzopIndex = false;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                numMapTasks = Integer.parseInt(args[++i]);
            } else if ("-r".equals(args[i])) {
                numReduceTasks = Integer.parseInt(args[++i]);
            } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
                sortConfig.setIgnoreCase(true);
            } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
                sortConfig.setUnique(true);
            } else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
                String[] parts = StringUtils.split(args[++i], ",");
                sortConfig.setStartKey(Integer.valueOf(parts[0]));
                if (parts.length > 1) {
                    sortConfig.setEndKey(Integer.valueOf(parts[1]));
                }
            } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
                sortConfig.setFieldSeparator(args[++i]);
            } else if ("--total-order".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else if ("--map-codec".equals(args[i])) {
                mapCodecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
            } else if ("--codec".equals(args[i])) {
                codecClass = (Class<? extends CompressionCodec>) Class.forName(args[++i]);
            } else if ("--lzop-index".equals(args[i])) {
                createLzopIndex = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }

    if (runJob(new JobConf(sortConfig.getConfig()), numMapTasks, numReduceTasks, sampler, codecClass,
            mapCodecClass, createLzopIndex, otherArgs.get(0), otherArgs.get(1))) {
        return 0;
    }
    return 1;
}

From source file:com.alexholmes.hadooputils.sort.SortTest.java

License:Apache License

public void run(TextIOJobBuilder builder) throws Exception {
    run(new JobConf(new SortConfig(builder.getFs().getConf()).getConfig()), builder, 1, 1, null);
}

From source file:com.alexholmes.hadooputils.sort.SortTest.java

License:Apache License

public void run(SortConfig sortConfig, TextIOJobBuilder builder) throws Exception {
    run(new JobConf(sortConfig.getConfig()), builder, 1, 1, null);
}