public static void setInputPathFilter(Job job, Class<? extends PathFilter> filter) 

Set a PathFilter to be applied to the input paths for the map-reduce job.


From source file:co.cask.cdap.template.etl.batch.source.FileBatchSource.java

License:Apache License

/**
 * Prepares the Hadoop job for one run of this batch source.
 *
 * <p>Computes the read cutoff as one hour before the logical start time, truncated to the hour;
 * copies the configured file system properties, file regex and path into the job configuration;
 * when a time table is configured, reads the stored dates and records the hour attempted by this
 * run; then registers the path filter, the input path and the maximum split size.
 *
 * <p>NOTE(review): lines appear to have been lost from this listing. Closing braces are missing
 * throughout, the class-loading statement stops after {@code classLoader}, and the
 * {@code else} branch is empty. Confirm against the original source before relying on it.
 *
 * @param context supplies the logical start time, the Hadoop job and dataset access
 * @throws Exception declared by the signature; the only exception handled in the visible code
 *                   is the NumberFormatException from parsing the split size
 */
public void prepareRun(BatchSourceContext context) throws Exception {
    //SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");

    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    // Calendar.getInstance() uses the JVM default time zone and locale for the truncation.
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    // Copy every entry of the JSON-encoded file system properties into the job configuration.
    if (config.fileSystemProperties != null) {
        Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());

    // The regex is optional; the path is always written to the configuration.
    if (config.fileRegex != null) {
        conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        // Nothing stored yet: fall back to a list holding only the epoch date.
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        // Record the hour attempted by this run, writing only when it differs from what was read.
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        // The configuration receives the value that was read (or defaulted), not the updated one.
        conf.set(LAST_TIME_READ, datesToRead);

    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));
    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        // NOTE(review): the statement below is cut off after "classLoader"; presumably it loaded
        // config.inputFormatClass and set it on the job -- confirm against the original source.
        Class<? extends FileInputFormat> classType = (Class<? extends FileInputFormat>) classLoader
    } else {
    // NOTE(review): the else branch above has no visible body; a default input format was
    // presumably set here -- confirm.
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    long maxSplitSize;
    try {
        maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
        // Long.parseLong also throws NumberFormatException for null, so a missing value lands here too.
        maxSplitSize = DEFAULT_SPLIT_SIZE;
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);

From source file:com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob.java

License:Apache License

/**
 * Creates a job which uses a temporary staging location for the output data.
 * The data is only copied to the final output directory on successful completion
 * of the job.  This prevents existing output data from being overwritten unless
 * the job completes successfully.
 *
 * @param props Job properties
 * @param jobName Name of the job
 * @param inputPaths Input paths job will be reading from
 * @param stagingLocation Where output of job should be staged
 * @param outputPath The final output location for the data
 * @param log The logger
 * @return The job
 */
public static StagedOutputJob createStagedJob(Properties props, String jobName, List<String> inputPaths,
        String stagingLocation, String outputPath, final Logger log) {
    Configuration config = createConfigurationFromProps(props);

    final StagedOutputJob retVal;
    try {
        retVal = new StagedOutputJob(config, stagingLocation, log);

        FileInputFormat.setInputPathFilter(retVal, HiddenFilePathFilter.class);

    } catch (IOException e) {
        log.error("IOException when making a job, wtf?", e);
        throw new RuntimeException(e);

    try {
        FileInputFormat.setInputPaths(retVal, StringUtils.join(inputPaths.iterator(), ","));
    } catch (IOException e) {
        log.error("Unable to set up input paths.", e);
        throw new RuntimeException(e);

    FileOutputFormat.setOutputPath(retVal, new Path(outputPath));

    return retVal;

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounter.java

License:Apache License

/**
 * @param args command-line arguments
 * @throws IOException
 * @throws Exception
 */
// Command-line entry point. Parses the arguments, creates a Spark context and a Hadoop Job,
// reads records from inputPath through WARCInputFormat, maps every record to an (adType, 1)
// pair, sums the pairs per key and saves the result to outputPath, logging the two
// accumulators and the elapsed time.
//
// NOTE(review): lines appear to have been lost from this listing -- closing braces and the
// closers of both anonymous classes are missing, the ARGNAME_MAXFILES branch is empty, and
// nothing follows the "stop spark context" comment. Confirm against the original source.
public static void main(String[] args) throws IOException, Exception {

    String inputPath = null;
    String outputPath = null;
    String master = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments.
    // Options that take a value consume the next element with args[++i].
    for (int i = 0; i < args.length; i++) {
        try {
            if (args[i].equals(ARGNAME_INPATH)) {
                inputPath = args[++i];
            } else if (args[i].equals(ARGNAME_OUTPATH)) {
                outputPath = args[++i];
            } else if (args[i].equals(ARGNAME_MASTER)) {
                master = args[++i];
            } else if (args[i].equals(ARGNAME_S3ACCESSKEY)) {
                s3AccessKey = args[++i];
            } else if (args[i].equals(ARGNAME_S3SECRETKEY)) {
                s3SecretKey = args[++i];
            } else if (args[i].equals(ARGNAME_MAXFILES)) {
                // NOTE(review): this branch is empty in the listing, so any value that follows
                // the flag is not consumed here and reaches the "Unsupported argument" branch.
            } else if (args[i].equals(ARGNAME_OVERWRITE)) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[i]);
        } catch (ArrayIndexOutOfBoundsException e) {
            // Reached when a value-taking option is the last argument (args[++i] is out of range).
            throw new IllegalArgumentException();
    LOG.info(" inputPath :" + inputPath);
    // Input path, output path and master are all mandatory.
    if (inputPath == null || outputPath == null || master == null) {
        throw new IllegalArgumentException();

    // Any input path containing the substring "s3n" requires both S3 keys.
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();

    SparkConf sparkConf = new SparkConf().setAppName("GoogleAdsCounter").setMaster(master);

    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // NOTE(review): Job.getInstance(conf) is documented to copy the Configuration, so values set
    // on conf after this point may not reach job.getConfiguration(), which is what is passed to
    // newAPIHadoopFile below -- confirm.
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);

    //define the accumulators to count total response pages and total Google Ad Pages
    final Accumulator<Integer> totalResponsePagesAccumulator = sc.accumulator(0);
    // NOTE(review): nothing in the visible code adds to this accumulator.
    final Accumulator<Integer> totalGoogleAdPagesAccumulator = sc.accumulator(0);

    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    JavaPairRDD<LongWritable, WARCWritable> records = sc.newAPIHadoopFile(inputPath, WARCInputFormat.class,
            LongWritable.class, WARCWritable.class, job.getConfiguration());

    // Map each record to (adType, 1).
    JavaPairRDD<String, Integer> warcRecords = records
            .mapToPair(new PairFunction<Tuple2<LongWritable, WARCWritable>, String, Integer>() {

                public Tuple2<String, Integer> call(Tuple2<LongWritable, WARCWritable> record)
                        throws Exception {

                    String recordType = record._2().getRecord().getHeader().getRecordType();

                    String adType = null;
                    if (recordType.equals("response")) {

                        totalResponsePagesAccumulator.add(1); // total response pages

                        // new String(byte[]) decodes with the JVM default charset.
                        String recordContent = new String(record._2().getRecord().getContent());

                        // parse Html content of web page using Jsoup
                        Document doc = Jsoup.parse(recordContent);

                        // Get the <script> tag elements 
                        Elements scriptElements = doc.getElementsByTag("script");

                        for (Element element : scriptElements) {

                            // if web page has google ads, then <script> tag contains "google_ad_client" 
                            if (element.data().contains("google_ad_client")) {


                                GoogleAdParser parser = new DefaultParser(element.data());

                                // NOTE(review): siteUrl, title, adClient, adSlot, width and height
                                // are assigned but not read anywhere in the visible code.
                                String siteUrl = record._2().getRecord().getHeader().getTargetURI();
                                String title = "Default"; // FIXME

                                String adClient = parser.getAttribute("google_ad_client") != null
                                        ? parser.getAttribute("google_ad_client")
                                        : "NA";
                                String adSlot = "default"; // FIXME
                                String width = parser.getAttribute("google_ad_width") != null
                                        ? parser.getAttribute("google_ad_width")
                                        : "NA";
                                String height = parser.getAttribute("google_ad_height") != null
                                        ? parser.getAttribute("google_ad_height")
                                        : "NA";
                                // The ad type falls back to "text" when the attribute is absent.
                                adType = parser.getAttribute("google_ad_type") != null
                                        ? parser.getAttribute("google_ad_type")
                                        : "text";
                        // adType is still null here when no script element contained "google_ad_client".
                        return new Tuple2<String, Integer>(adType, 1);
                    } else
                        // Non-response records are emitted under the null key.
                        return new Tuple2<String, Integer>(adType, 1);


    // Sum the counts for each ad type.
    JavaPairRDD<String, Integer> adTypeCounts = warcRecords
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);

    long startTime = System.currentTimeMillis();

    //writing output to file
    adTypeCounts.saveAsNewAPIHadoopFile(outputPath, org.apache.hadoop.io.Text.class,
            org.apache.hadoop.io.Text.class, org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    //print accumulator values      
    LOG.info(" totalResponsePagesAccumulator value : " + totalResponsePagesAccumulator.value());
    LOG.info(" totalGoogleAdPagesAccumulator value : " + totalGoogleAdPagesAccumulator.value());
    long endTime = System.currentTimeMillis();
    long difference = endTime - startTime;
    LOG.info("Elapsed milliseconds: " + difference);

    //stop spark context

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
// Parses the arguments, creates the Hadoop Job, sets the input path, path filter and output
// path, optionally clears the output directory, then waits for the job and logs its counters.
// Returns 0 when the job succeeds and 1 otherwise.
//
// NOTE(review): lines appear to have been lost from this listing -- closing braces are missing,
// the ARGNAME_MAXFILES branch is empty, two block comments have lost their delimiters, and
// several "Set ..." comments have no statement beneath them. Confirm against the original.
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    // Options that take a value consume the next element with args[++index].
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                // NOTE(review): no statement is visible in this branch, so any value following
                // the flag is not consumed here and reaches the "Unsupported argument" branch.
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
        } catch (ArrayIndexOutOfBoundsException e) {
            // Reached when a value-taking option is the last argument.
            throw new IllegalArgumentException();

    // Input and output paths are mandatory.
    if (inputPath == null || outputPath == null) {
        throw new IllegalArgumentException();

    // Any input path containing the substring "s3n" requires both S3 keys.
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // NOTE(review): Job.getInstance(conf) is documented to copy the Configuration, so values set
    // on conf after this point may not be visible to the job -- confirm.
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    // NOTE(review): the next seven lines look like the body of a block comment whose opening
    // and closing delimiters were lost.
     * // Defines additional single text based output 'GoogleAdClient' for
     * the job MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     * TextOutputFormat.class, Text.class,LongWritable.class );
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job,
     * "GoogleAdType",TextOutputFormat.class, Text.class,
     * LongWritable.class);
    // Set which InputFormat class to use.

    // Set which OutputFormat class to use.

    // NOTE(review): the next four lines also look like block-comment text without delimiters.
     * Using MultipleOutputs creates zero-sized default output e.g.: *
     * part-r-00000. To prevent this use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in Hadoop job
     * configuration.
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    // Set the output data types.

    // Set which Mapper and Reducer classes to use.
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);

    // set combiner

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {

        // NOTE(review): waitForCompletion is invoked a second time just to log the status; this
        // branch is only entered when the first call returned true, so the stored result could
        // be logged instead.
        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        // Report the two counters identified by the TestCounters constants.
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;

From source file:com.talis.mapreduce.dicenc.ThirdDriver.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        return -1;
    }/*www  .j a  v a2  s  .c  o  m*/

    Job job = new Job(getConf(), "third");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.setInputPathFilter(job, DataPathFilter.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));



    return job.waitForCompletion(true) ? 0 : 1;

From source file:datafu.hourglass.jobs.StagedOutputJob.java

License:Apache License

/**
 * Creates a job which uses a temporary staging location for the output data.
 * The data is only copied to the final output directory on successful completion
 * of the job.  This prevents existing output data from being overwritten unless
 * the job completes successfully.
 *
 * @param conf configuration
 * @param jobName job name
 * @param inputPaths input paths
 * @param stagingLocation where to stage output temporarily
 * @param outputPath output path
 * @param log logger
 * @return job
 */
public static StagedOutputJob createStagedJob(Configuration conf, String jobName, List<String> inputPaths,
        String stagingLocation, String outputPath, final Logger log) {
    final StagedOutputJob retVal;
    try {
        retVal = new StagedOutputJob(conf, stagingLocation, log);
        FileInputFormat.setInputPathFilter(retVal, HiddenFilePathFilter.class);
    } catch (IOException e) {
        log.error("IOException when making a job", e);
        throw new RuntimeException(e);

    if (inputPaths != null) {
        try {
            FileInputFormat.setInputPaths(retVal, StringUtils.join(inputPaths.iterator(), ","));
        } catch (IOException e) {
            log.error("Unable to set up input paths.", e);
            throw new RuntimeException(e);

    FileOutputFormat.setOutputPath(retVal, new Path(outputPath));

    return retVal;

From source file:org.apache.jena.tdbloader4.ThirdDriver.java

License:Apache License

/**
 * Configures and runs this driver's job over the given input and output paths, optionally
 * enabling compression.
 *
 * <p>NOTE(review): lines appear to have been lost from this listing. The {@code getBoolean}
 * call is cut off before its default-value argument, closing braces are missing, and no mapper,
 * reducer or format configuration is visible. Confirm against the original source.
 *
 * @param args exactly two elements: the input path and the output path
 * @return -1 when the argument count is wrong, otherwise 0 if the job succeeds and 1 if it fails
 * @throws Exception declared by the signature; nothing is caught in the visible code
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        return -1;
    }

    log.debug("input: {}, output: {}", args[0], args[1]);

    Configuration configuration = getConf();
    // NOTE(review): the statement below ends at the comma; its second (default) argument and
    // closing parenthesis are missing from this listing.
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
    log.debug("Compression is {}", useCompression ? "enabled" : "disabled");

    // Turn on map output compression with the Gzip codec and set the output compression type
    // to BLOCK; done on the Configuration before the Job is created from it.
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

    Job job = new Job(configuration);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.setInputPathFilter(job, ExcludeNodeTableFilter.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));




    Utils.setReducers(job, configuration, log);


    // Compress the job's sequence file output with Gzip at block level.
    if (useCompression) {
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;

From source file:org.apache.pig.piggybank.storage.HadoopJobHistoryLoader.java

License:Apache License

public void setLocation(String location, Job job) throws IOException {
    FileInputFormat.setInputPaths(job, location);
    FileInputFormat.setInputPathFilter(job, JobHistoryPathFilter.class);

From source file:org.apache.pig.test.PigTestLoader.java

License:Apache License

public void setLocation(String location, Job job) throws IOException {
    super.setLocation(location, job);
    FileInputFormat.setInputPathFilter(job, TestPathFilter.class);
    test = true;/*from   w  w w . j  a  v a  2 s . co  m*/