Example usage for org.apache.hadoop.mapred JobConf setInputFormat

Introduction

This page lists example usages of org.apache.hadoop.mapred.JobConf#setInputFormat.

Prototype

public void setInputFormat(Class<? extends InputFormat> theClass) 

Document

Set the InputFormat implementation for the map-reduce job.
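
A minimal, self-contained sketch of the call in context (the class name, paths, and reliance on the default identity mapper and reducer are illustrative, not taken from the sources below):

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.*;

public class TextPassThrough {
    public static void main(String[] args) throws Exception {
        // illustrative driver; relies on the old mapred API defaults
        // (identity map/reduce, TextOutputFormat, LongWritable/Text output types)
        JobConf conf = new JobConf(TextPassThrough.class);
        conf.setJobName("text-pass-through");

        // TextInputFormat presents each line as a (byte offset, line text) record;
        // with the default identity mapper and reducer the job copies its input through
        conf.setInputFormat(TextInputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}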

Usage

From source file:cascading.jdbc.db.DBInputFormat.java

License:Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job             The job
 * @param inputClass      the class object implementing DBWritable, which is the
 *                        Java object holding tuple fields.
 * @param tableName       The table to read data from
 * @param conditions      The condition which to select data with, eg. '(updated >
 *                        20070101 AND length > 0)'
 * @param orderBy         the fieldNames in the orderBy clause.
 * @param limit           the maximum number of rows to read; -1 means no limit
 * @param fieldNames      The field names in the table
 * @param concurrentReads the maximum number of concurrent reads
 */
public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String tableName,
        String conditions, String orderBy, long limit, int concurrentReads, String... fieldNames) {
    job.setInputFormat(DBInputFormat.class);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setInputClass(inputClass);
    dbConf.setInputTableName(tableName);
    dbConf.setInputFieldNames(fieldNames);
    dbConf.setInputConditions(conditions);
    dbConf.setInputOrderBy(orderBy);

    if (limit != -1)
        dbConf.setInputLimit(limit);

    dbConf.setMaxConcurrentReadsNum(concurrentReads);
}
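
A hypothetical caller of the method above (the driver class, the EmployeeRecord DBWritable bean, the table, and the connection settings are invented for illustration, and configureDB is assumed to mirror Hadoop's own DBConfiguration helper):

JobConf job = new JobConf(MyDriver.class);

// register the JDBC driver and connection first (all values hypothetical)
DBConfiguration.configureDB(job, "com.mysql.jdbc.Driver",
        "jdbc:mysql://localhost/mydb", "user", "password");

// read employees updated since 20070101 with two concurrent readers and no row limit
DBInputFormat.setInput(job, EmployeeRecord.class, "employees",
        "(updated > 20070101)", "id", -1L, 2, "id", "name", "updated");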

From source file:cascading.jdbc.db.DBInputFormat.java

License:Apache License

/**
 * Initializes the map-part of the job with the appropriate input settings.
 *
 * @param job             The job
 * @param inputClass      the class object implementing DBWritable, which is the
 *                        Java object holding tuple fields.
 * @param selectQuery     the input query to select fields. Example :
 *                        "SELECT f1, f2, f3 FROM Mytable ORDER BY f1"
 * @param countQuery      the input query that returns the number of records in
 *                        the table.
 *                        Example : "SELECT COUNT(f1) FROM Mytable"
 * @param limit           the maximum number of rows to read; -1 means no limit
 * @param concurrentReads the maximum number of concurrent reads
 * @see #setInput(org.apache.hadoop.mapred.JobConf, Class, String, String, String, String...)
 */
public static void setInput(JobConf job, Class<? extends DBWritable> inputClass, String selectQuery,
        String countQuery, long limit, int concurrentReads) {
    job.setInputFormat(DBInputFormat.class);

    DBConfiguration dbConf = new DBConfiguration(job);

    dbConf.setInputClass(inputClass);
    dbConf.setInputQuery(selectQuery);
    dbConf.setInputCountQuery(countQuery);

    if (limit != -1)
        dbConf.setInputLimit(limit);

    dbConf.setMaxConcurrentReadsNum(concurrentReads);
}
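
The query-based variant might be driven like this, reusing a JobConf configured via configureDB as above (the queries and the EmployeeRecord DBWritable are again hypothetical):

// let the database do the ordering; the count query sizes the input splits
DBInputFormat.setInput(job, EmployeeRecord.class,
        "SELECT id, name FROM employees ORDER BY id",
        "SELECT COUNT(id) FROM employees",
        -1L, 1);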

From source file:cascading.scheme.SequenceFile.java

License:Open Source License

@Override
public void sourceInit(Tap tap, JobConf conf) {
    conf.setInputFormat(SequenceFileInputFormat.class);
}

From source file:cascading.scheme.TextLine.java

License:Open Source License

@Override
public void sourceInit(Tap tap, JobConf conf) {
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf)))
        conf.setInputFormat(ZipInputFormat.class);
    else
        conf.setInputFormat(TextInputFormat.class);
}

From source file:cascading.tap.hadoop.io.MultiInputFormat.java

License:Open Source License

/**
 * Used to set the current JobConf with all sub jobs configurations.
 *
 * @param toJob    the job configuration to merge into
 * @param fromJobs the per-source job configurations to merge
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        if (fromJob.get("mapred.input.format.class") == null)
            throw new CascadingException(
                    "mapred.input.format.class is required, should be set in source Scheme#sourceConfInit");

        configs.add(HadoopUtil.getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = HadoopUtil.isLocal(fromJob);
    }

    if (!allPaths.isEmpty()) // it's possible there aren't any
        FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", HadoopUtil.serializeBase64(configs, toJob, true));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        HadoopUtil.setLocal(toJob);
}
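
A sketch of how a caller might combine two independently configured inputs (the driver class and paths are invented, and imports are omitted as in the snippets above). Each per-source JobConf gets its own input format, which is what populates the required mapred.input.format.class key:

JobConf toJob = new JobConf(MyFlow.class);

// hypothetical text source
JobConf textJob = new JobConf(toJob);
textJob.setInputFormat(TextInputFormat.class);
FileInputFormat.setInputPaths(textJob, new Path("logs"));

// hypothetical sequence-file source
JobConf seqJob = new JobConf(toJob);
seqJob.setInputFormat(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(seqJob, new Path("archive"));

// merge both configurations into the job that will actually run
MultiInputFormat.addInputFormat(toJob, textJob, seqJob);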

From source file:cascading.tap.hadoop.MultiInputFormat.java

License:Open Source License

/**
 * Used to set the current JobConf with all sub jobs configurations.
 *
 * @param toJob    the job configuration to merge into
 * @param fromJobs the per-source job configurations to merge
 */
public static void addInputFormat(JobConf toJob, JobConf... fromJobs) {
    toJob.setInputFormat(MultiInputFormat.class);
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for (JobConf fromJob : fromJobs) {
        configs.add(getConfig(toJob, fromJob));
        Collections.addAll(allPaths, FileInputFormat.getInputPaths(fromJob));

        if (!isLocal)
            isLocal = "local".equalsIgnoreCase(fromJob.get("mapred.job.tracker")); // null-safe if unset
    }

    FileInputFormat.setInputPaths(toJob, (Path[]) allPaths.toArray(new Path[allPaths.size()]));

    try {
        toJob.set("cascading.multiinputformats", Util.serializeBase64(configs));
    } catch (IOException exception) {
        throw new CascadingException("unable to pack input formats", exception);
    }

    if (isLocal)
        toJob.set("mapred.job.tracker", "local");
}

From source file:clusteringblocks.ClusteringBlocks.java

public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    JobConf job = new JobConf(conf, ClusteringBlocks.class);

    Path in = new Path(args[0]);
    Path out = new Path(args[1]);
    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);

    job.setJobName("ClusteringBlocks");
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormat(KeyValueTextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    //   job.set("key.value.separator.in.input.line", "");

    JobClient.runJob(job);

    return 0;
}
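
The commented-out property above controls how KeyValueTextInputFormat splits each input line; by default the first tab separates key from value. To split on a different byte, the setting would look like this (the comma is an illustrative choice; the property name is the one used by the old mapred API):

// split each input line at the first ',' instead of the default tab
job.set("key.value.separator.in.input.line", ",");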

From source file:cn.edu.hfut.dmic.webcollectorcluster.fetcher.Fetcher.java

@Override
public int run(String[] args) throws Exception {
    JobConf jc = new JobConf(getConf());
    jc.setJarByClass(Fetcher.class);
    jc.setInputFormat(SequenceFileInputFormat.class);
    Path input = new Path(args[0], "current");
    Path output = new Path(args[1]);
    Configuration conf = CrawlerConfiguration.create();
    FileSystem fs = output.getFileSystem(conf);
    if (fs.exists(output)) {
        fs.delete(output, true); // recursively remove any stale output
    }
    FileInputFormat.addInputPath(jc, input);
    FileOutputFormat.setOutputPath(jc, output);

    jc.setMapOutputKeyClass(Text.class);
    jc.setMapOutputValueClass(WebWritable.class);

    jc.setMapRunnerClass(Fetcher.class);
    jc.setOutputFormat(FetcherOutputFormat.class);

    JobClient.runJob(jc);
    return 0;
}

From source file:cn.edu.xmu.dm.mapreduce.MultiFileWordCount.java

License:Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(LongWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
}

From source file:cn.edu.xmu.dm.mapreduce.Sort.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    // the old-API JobConf is what actually configures and drives this job
    JobConf jobConf = new JobConf(getConf(), Sort.class);
    jobConf.setJobName("sorter");

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);

    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);

    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
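
Since run() parses its own flags, a programmatic invocation could look like the following (the paths and option values are illustrative, and the class is assumed to implement Tool, as the getConf() call suggests):

// sort sequence files with 8 reducers, sampling 10% of keys
// (up to 10000 samples from 10 splits) to build a total-order partition file
int exitCode = ToolRunner.run(new Configuration(), new Sort(),
        new String[] { "-r", "8", "-totalOrder", "0.1", "10000", "10",
                "/data/input", "/data/sorted" });
System.exit(exitCode);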