Usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.setInputPathFilter
public static void setInputPathFilter(Job job, Class<? extends PathFilter> filter)
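The filter class is instantiated reflectively when FileInputFormat lists the input paths, so it needs a public no-argument constructor; accept(Path) is then called for each candidate path, and only paths it returns true for are turned into input splits. Note that with recursive listing the filter also sees directories, so real filters often accept any directory. A minimal sketch of the call in isolation, before the per-project examples below — the class name, input path, and .csv extension are illustrative, not taken from any of the sources:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class CsvOnlyFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        // Called once per candidate path during input listing;
        // keep only files whose name ends in .csv (assumes a flat input directory).
        return path.getName().endsWith(".csv");
    }

    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance(new Configuration(), "csv-only-example");
        FileInputFormat.addInputPath(job, new Path("/data/in")); // illustrative path
        FileInputFormat.setInputPathFilter(job, CsvOnlyFilter.class);
    }
}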
From source file: co.cask.cdap.template.etl.batch.source.FileBatchSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    //SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = context.getHadoopJob();
    Configuration conf = job.getConfiguration();
    if (config.fileSystemProperties != null) {
        Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            conf.set(entry.getKey(), entry.getValue());
        }
    }
    if (config.fileRegex != null) {
        conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    }
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }
    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));

    if (!Strings.isNullOrEmpty(config.inputFormatClass)) {
        ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
        Class<? extends FileInputFormat> classType = (Class<? extends FileInputFormat>) classLoader
                .loadClass(config.inputFormatClass);
        job.setInputFormatClass(classType);
    } else {
        job.setInputFormatClass(CombineTextInputFormat.class);
    }
    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));

    long maxSplitSize;
    try {
        maxSplitSize = Long.parseLong(config.maxSplitSize);
    } catch (NumberFormatException e) {
        maxSplitSize = DEFAULT_SPLIT_SIZE;
    }
    CombineTextInputFormat.setMaxInputSplitSize(job, maxSplitSize);
}
From source file: co.cask.hydrator.plugin.batch.source.FileBatchSource.java
License: Apache License

@Override
public void prepareRun(BatchSourceContext context) throws Exception {
    //SimpleDateFormat needs to be local because it is not threadsafe
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd-HH");
    //calculate date one hour ago, rounded down to the nearest hour
    prevHour = new Date(context.getLogicalStartTime() - TimeUnit.HOURS.toMillis(1));
    Calendar cal = Calendar.getInstance();
    cal.setTime(prevHour);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    prevHour = cal.getTime();

    Job job = JobUtils.createInstance();
    Configuration conf = job.getConfiguration();

    Map<String, String> properties = GSON.fromJson(config.fileSystemProperties, MAP_STRING_STRING_TYPE);
    //noinspection ConstantConditions
    for (Map.Entry<String, String> entry : properties.entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    conf.set(INPUT_REGEX_CONFIG, config.fileRegex);
    conf.set(INPUT_NAME_CONFIG, config.path);

    if (config.timeTable != null) {
        table = context.getDataset(config.timeTable);
        datesToRead = Bytes.toString(table.read(LAST_TIME_READ));
        if (datesToRead == null) {
            List<Date> firstRun = Lists.newArrayList(new Date(0));
            datesToRead = GSON.toJson(firstRun, ARRAYLIST_DATE_TYPE);
        }
        List<Date> attempted = Lists.newArrayList(prevHour);
        String updatedDatesToRead = GSON.toJson(attempted, ARRAYLIST_DATE_TYPE);
        if (!updatedDatesToRead.equals(datesToRead)) {
            table.write(LAST_TIME_READ, updatedDatesToRead);
        }
        conf.set(LAST_TIME_READ, datesToRead);
    }
    conf.set(CUTOFF_READ_TIME, dateFormat.format(prevHour));

    FileInputFormat.setInputPathFilter(job, BatchFileFilter.class);
    FileInputFormat.addInputPath(job, new Path(config.path));
    if (config.maxSplitSize != null) {
        FileInputFormat.setMaxInputSplitSize(job, config.maxSplitSize);
    }
    context.setInput(
            Input.of(config.referenceName, new SourceInputFormatProvider(config.inputFormatClass, conf)));
}
From source file: com.linkedin.whiteelephant.mapreduce.lib.job.StagedOutputJob.java
License: Apache License

/**
 * Creates a job which uses a temporary staging location for the output data.
 * The data is only copied to the final output directory on successful completion
 * of the job. This prevents existing output data from being overwritten unless
 * the job completes successfully.
 *
 * @param props Job properties
 * @param jobName Name of the job
 * @param inputPaths Input paths the job will be reading from
 * @param stagingLocation Where the output of the job should be staged
 * @param outputPath The final output location for the data
 * @param log The logger
 * @return The job
 */
public static StagedOutputJob createStagedJob(Properties props, String jobName, List<String> inputPaths,
        String stagingLocation, String outputPath, final Logger log) {
    Configuration config = createConfigurationFromProps(props);
    final StagedOutputJob retVal;
    try {
        retVal = new StagedOutputJob(config, stagingLocation, log);
        retVal.setJobName(jobName);
        retVal.setJarByClass(getCallersClass());
        FileInputFormat.setInputPathFilter(retVal, HiddenFilePathFilter.class);
    } catch (IOException e) {
        log.error("IOException when making a job, wtf?", e);
        throw new RuntimeException(e);
    }

    try {
        FileInputFormat.setInputPaths(retVal, StringUtils.join(inputPaths.iterator(), ","));
    } catch (IOException e) {
        log.error("Unable to set up input paths.", e);
        throw new RuntimeException(e);
    }

    FileOutputFormat.setOutputPath(retVal, new Path(outputPath));
    return retVal;
}
From source file: com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounter.java
License: Apache License

/**
 * @param args
 * @throws IOException
 * @throws Exception
 */
public static void main(String[] args) throws IOException, Exception {
    String inputPath = null;
    String outputPath = null;
    String master = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments.
    for (int i = 0; i < args.length; i++) {
        try {
            if (args[i].equals(ARGNAME_INPATH)) {
                inputPath = args[++i];
            } else if (args[i].equals(ARGNAME_OUTPATH)) {
                outputPath = args[++i];
            } else if (args[i].equals(ARGNAME_MASTER)) {
                master = args[++i];
            } else if (args[i].equals(ARGNAME_S3ACCESSKEY)) {
                s3AccessKey = args[++i];
            } else if (args[i].equals(ARGNAME_S3SECRETKEY)) {
                s3SecretKey = args[++i];
            } else if (args[i].equals(ARGNAME_MAXFILES)) {
                WarcFileFilter.setMax(Long.parseLong(args[++i]));
            } else if (args[i].equals(ARGNAME_OVERWRITE)) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }
    LOG.info(" inputPath :" + inputPath);
    if (inputPath == null || outputPath == null || master == null) {
        usage();
        throw new IllegalArgumentException();
    }
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    SparkConf sparkConf = new SparkConf().setAppName("GoogleAdsCounter").setMaster(master);
    JavaSparkContext sc = new JavaSparkContext(sparkConf);

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    //define the accumulators to count total response pages and total Google Ad pages
    final Accumulator<Integer> totalResponsePagesAccumulator = sc.accumulator(0);
    final Accumulator<Integer> totalGoogleAdPagesAccumulator = sc.accumulator(0);

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    JavaPairRDD<LongWritable, WARCWritable> records = sc.newAPIHadoopFile(inputPath, WARCInputFormat.class,
            LongWritable.class, WARCWritable.class, job.getConfiguration());

    JavaPairRDD<String, Integer> warcRecords = records
            .mapToPair(new PairFunction<Tuple2<LongWritable, WARCWritable>, String, Integer>() {
                public Tuple2<String, Integer> call(Tuple2<LongWritable, WARCWritable> record) throws Exception {
                    String recordType = record._2().getRecord().getHeader().getRecordType();
                    String adType = null;
                    if (recordType.equals("response")) {
                        totalResponsePagesAccumulator.add(1); // total response pages
                        String recordContent = new String(record._2().getRecord().getContent());
                        // parse HTML content of the web page using Jsoup
                        Document doc = Jsoup.parse(recordContent);
                        // get the <script> tag elements
                        Elements scriptElements = doc.getElementsByTag("script");
                        for (Element element : scriptElements) {
                            // if the web page has Google ads, the <script> tag contains "google_ad_client"
                            if (element.data().contains("google_ad_client")) {
                                totalGoogleAdPagesAccumulator.add(1);
                                GoogleAdParser parser = new DefaultParser(element.data());
                                String siteUrl = record._2().getRecord().getHeader().getTargetURI();
                                String title = "Default"; // FIXME
                                String adClient = parser.getAttribute("google_ad_client") != null
                                        ? parser.getAttribute("google_ad_client") : "NA";
                                String adSlot = "default"; // FIXME
                                String width = parser.getAttribute("google_ad_width") != null
                                        ? parser.getAttribute("google_ad_width") : "NA";
                                String height = parser.getAttribute("google_ad_height") != null
                                        ? parser.getAttribute("google_ad_height") : "NA";
                                adType = parser.getAttribute("google_ad_type") != null
                                        ? parser.getAttribute("google_ad_type") : "text";
                            }
                        }
                        return new Tuple2<String, Integer>(adType, 1);
                    } else {
                        return new Tuple2<String, Integer>(adType, 1);
                    }
                }
            });

    JavaPairRDD<String, Integer> adTypeCounts = warcRecords
            .reduceByKey(new Function2<Integer, Integer, Integer>() {
                public Integer call(Integer i1, Integer i2) {
                    return i1 + i2;
                }
            });

    // Delete the output path directory if it already exists and the user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    long startTime = System.currentTimeMillis();

    //writing output to file
    adTypeCounts.saveAsNewAPIHadoopFile(outputPath, org.apache.hadoop.io.Text.class,
            org.apache.hadoop.io.Text.class, org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);

    //print accumulator values
    LOG.info(" totalResponsePagesAccumulator value : " + totalResponsePagesAccumulator.value());
    LOG.info(" totalGoogleAdPagesAccumulator value : " + totalGoogleAdPagesAccumulator.value());

    long endTime = System.currentTimeMillis();
    long difference = endTime - startTime;
    LOG.info("Elapsed milliseconds: " + difference);

    //stop spark context
    sc.stop();
}
From source file: com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java
License: Apache License

/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {
    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {
            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }
    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }
    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);

    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }

    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");
    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is: is this really required? If we just
    // point to a file in S3, that should solve our problem.
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and the user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    /*
     * // Defines additional single text based output 'GoogleAdClient' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdClient", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     *
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job, "GoogleAdType", TextOutputFormat.class,
     *         Text.class, LongWritable.class);
     */

    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates a zero-sized default output, e.g. part-r-00000.
     * To prevent this, use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in the Hadoop job configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    // job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);

    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    // waitForCompletion can only be called once per job, so capture the result
    // rather than calling it a second time inside the log statement.
    boolean success = job.waitForCompletion(true);
    if (success) {
        LOG.info("Job completion status : " + success);
        long endTime = System.currentTimeMillis();
        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);

        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}
From source file: com.talis.mapreduce.dicenc.ThirdDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Job job = new Job(getConf(), "third");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.setInputPathFilter(job, DataPathFilter.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(ThirdMapper.class);
    job.setReducerClass(ThirdReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: datafu.hourglass.jobs.StagedOutputJob.java
License: Apache License

/**
 * Creates a job which uses a temporary staging location for the output data.
 * The data is only copied to the final output directory on successful completion
 * of the job. This prevents existing output data from being overwritten unless
 * the job completes successfully.
 *
 * @param conf configuration
 * @param jobName job name
 * @param inputPaths input paths
 * @param stagingLocation where to stage output temporarily
 * @param outputPath output path
 * @param log logger
 * @return job
 */
public static StagedOutputJob createStagedJob(Configuration conf, String jobName, List<String> inputPaths,
        String stagingLocation, String outputPath, final Logger log) {
    final StagedOutputJob retVal;
    try {
        retVal = new StagedOutputJob(conf, stagingLocation, log);
        retVal.setJobName(jobName);
        retVal.setJarByClass(getCallersClass());
        FileInputFormat.setInputPathFilter(retVal, HiddenFilePathFilter.class);
    } catch (IOException e) {
        log.error("IOException when making a job", e);
        throw new RuntimeException(e);
    }

    if (inputPaths != null) {
        try {
            FileInputFormat.setInputPaths(retVal, StringUtils.join(inputPaths.iterator(), ","));
        } catch (IOException e) {
            log.error("Unable to set up input paths.", e);
            throw new RuntimeException(e);
        }
    }

    FileOutputFormat.setOutputPath(retVal, new Path(outputPath));
    return retVal;
}
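Both StagedOutputJob variants (this one and the LinkedIn example above) register a HiddenFilePathFilter whose body is not part of this listing. By the usual Hadoop convention, such a filter rejects the underscore- and dot-prefixed files the framework itself writes into output directories (_SUCCESS, _logs, .crc files); a minimal sketch under that assumption, not the actual datafu source:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class HiddenFilePathFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        // Assumed behavior: skip names Hadoop treats as hidden,
        // e.g. "_SUCCESS", "_logs", ".part-r-00000.crc".
        String name = path.getName();
        return !name.startsWith("_") && !name.startsWith(".");
    }
}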
From source file: org.apache.jena.tdbloader4.ThirdDriver.java
License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    log.debug("input: {}, output: {}", args[0], args[1]);

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    log.debug("Compression is {}", useCompression ? "enabled" : "disabled");
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    Job job = new Job(configuration);
    job.setJobName(Constants.NAME_THIRD);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileInputFormat.setInputPathFilter(job, ExcludeNodeTableFilter.class);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(ThirdMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(ThirdReducer.class);
    job.setOutputKeyClass(LongQuadWritable.class);
    job.setOutputValueClass(NullWritable.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    if (useCompression) {
        SequenceFileOutputFormat.setCompressOutput(job, true);
        SequenceFileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    }

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: org.apache.pig.piggybank.storage.HadoopJobHistoryLoader.java
License: Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    FileInputFormat.setInputPaths(job, location);
    FileInputFormat.setInputPathFilter(job, JobHistoryPathFilter.class);
}
From source file: org.apache.pig.test.PigTestLoader.java
License: Apache License

@Override
public void setLocation(String location, Job job) throws IOException {
    super.setLocation(location, job);
    FileInputFormat.setInputPathFilter(job, TestPathFilter.class);
    test = true;
}