List of usage examples for org.apache.hadoop.mapreduce Job setReducerClass
public void setReducerClass(Class<? extends Reducer> cls) throws IllegalStateException
From source file:com.ifeng.vdn.parser.VideoLogParseDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(getClass());/* w ww . j a v a 2 s .c o m*/ FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(VideoLogParseMapper.class); job.setReducerClass(VideoLogParseReducer.class); job.setCombinerClass(VideoLogParseReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.ifeng.vdn.parser.VideoLogParseLocalDriver.java
License:Apache License
@Override public int run(String[] args) throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(getClass());/* ww w . jav a2 s . com*/ FileInputFormat.addInputPath(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setMapperClass(VideoLogParseMapper.class); job.setReducerClass(VideoLogParseReducer.class); job.setCombinerClass(VideoLogParseReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.iflytek.spider.crawl.CrawlDb.java
License:Apache License
public static Job createJob(Configuration config, Path crawlDb) throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Job job = AvroJob.getAvroJob(config); job.setJobName("crawldb " + crawlDb); Path current = new Path(crawlDb, CURRENT_NAME); if (FileSystem.get(config).exists(current)) { FileInputFormat.addInputPath(job, current); }/*w ww. j a v a2 s . c o m*/ job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(CrawlDbFilter.class); job.setReducerClass(CrawlDbReducer.class); FileOutputFormat.setOutputPath(job, newCrawlDb); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); return job; }
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
/** * Generate fetchlists in one or more segments. Whether to filter URLs or not * is read from the crawl.generate.filter property in the configuration files. * If the property is not found, the URLs are filtered. Same for the * normalisation.//from ww w . jav a2 s .c o m * * @param dbDir * Crawl database directory * @param segments * Segments directory * @param numLists * Number of reduce tasks * @param curTime * Current time in milliseconds * * @return Path to generated segment or null if no entries were selected * * @throws IOException * When an I/O error occurs * @throws ClassNotFoundException * @throws InterruptedException */ public Path[] generate(Path dbDir, Path segments, int numLists, long curTime, boolean force) throws IOException, InterruptedException, ClassNotFoundException { //getConf().set("mapred.temp.dir", "d:/tmp"); Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); LockUtil.createLockFile(fs, lock, force); LOG.info("Generator: Selecting best-scoring urls due for fetch."); LOG.info("Generator: starting"); Job job = AvroJob.getAvroJob(getConf()); if (numLists == -1) { // for politeness make numLists = job.getNumReduceTasks(); // a partition per fetch task } if ("local".equals(job.getConfiguration().get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } LOG.info("Generator: with " + numLists + " partition."); job.getConfiguration().setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(SelectorMapper.class); job.setReducerClass(SelectorReducer.class); FileOutputFormat.setOutputPath(job, tempDir); //job.setOutputFormatClass(AvroPairOutputFormat.class); job.setOutputFormatClass(GeneratorOutputFormat.class); job.setOutputKeyClass(Float.class); job.setOutputValueClass(SelectorEntry.class); // AvroMultipleOutputs.addNamedOutput(job, "seq", // AvroPairOutputFormat.class, Float.class, SelectorEntry.class); try { job.waitForCompletion(true); } catch (IOException e) { e.printStackTrace(); return null; } // read the subdirectories generated in the temp // output and turn them into segments List<Path> generatedSegments = new ArrayList<Path>(); FileStatus[] status = fs.listStatus(tempDir); try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); if (!subfetchlist.getName().startsWith("fetchlist-")) continue; // start a new partition job for this segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); fs.createNewFile(new Path(newSeg, "generatored")); generatedSegments.add(newSeg); } } catch (Exception e) { LOG.warn("Generator: exception while partitioning segments, exiting ..."); fs.delete(tempDir, true); return null; } if (generatedSegments.size() == 0) { LOG.warn("Generator: 0 records selected for fetching, exiting ..."); LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); return null; } if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir Path tempDir2 = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); job = AvroJob.getAvroJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.getConfiguration().setLong(Spider.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); FileInputFormat.addInputPath(job, subGenDir); } FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(CrawlDbUpdateMapper.class); // job.setReducerClass(CrawlDbUpdater.class); job.setOutputFormatClass(AvroMapOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); FileOutputFormat.setOutputPath(job, tempDir2); try { job.waitForCompletion(true); CrawlDb.install(job, dbDir); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); fs.delete(tempDir2, true); throw e; } fs.delete(tempDir2, true); } LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); if (LOG.isInfoEnabled()) { LOG.info("Generator: done."); } Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); }
From source file:com.iflytek.spider.crawl.GeneratorSmart.java
License:Apache License
private Path partitionSegment(FileSystem fs, Path segmentsDir, Path inputDir, int numLists) throws IOException, InterruptedException, ClassNotFoundException { // invert again, partition by host/domain/IP, sort by url hash if (LOG.isInfoEnabled()) { LOG.info("Generator: Partitioning selected urls for politeness:" + inputDir); }// w ww. j a v a 2 s.c om Path segment = new Path(segmentsDir, generateSegmentName()); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); LOG.info("Generator: segment: " + segment + " with " + numLists + " Fetchers"); Job job = AvroJob.getAvroJob(getConf()); job.setJobName("generate: partition " + segment); job.getConfiguration().setInt("partition.url.seed", new Random().nextInt()); FileInputFormat.addInputPath(job, inputDir); job.setInputFormatClass(AvroPairInputFormat.class); job.setMapperClass(SelectorInverseMapper.class); job.setPartitionerClass(AveragePartition.class); job.setMapOutputKeyClass(String.class); job.setMapOutputValueClass(SelectorEntry.class); job.setReducerClass(PartitionReducer.class); job.setNumReduceTasks(numLists); FileOutputFormat.setOutputPath(job, output); job.setOutputFormatClass(AvroPairOutputFormat.class); job.setOutputKeyClass(String.class); job.setOutputValueClass(CrawlDatum.class); job.waitForCompletion(true); return segment; }
From source file:com.igalia.metamail.jobs.MessagesByTimePeriod.java
License:Open Source License
private static Job setupJob() throws IOException, InterruptedException, ClassNotFoundException { Configuration config = HBaseConfiguration.create(); Job job = new Job(config, "MessagesByTimePeriod"); job.setJarByClass(MessagesByTimePeriod.class); Scan scan = new Scan(); scan.setCaching(500);/* w ww. j a v a 2 s. c om*/ scan.setCacheBlocks(false); // don't set to true for MR jobs // Mapper TableMapReduceUtil.initTableMapperJob(mailsTable, // input HBase table name scan, // Scan instance to control CF and attribute selection MessagesByTimePeriod.MessagesByTimePeriodMapper.class, Text.class, IntWritable.class, job); // Reducer job.setReducerClass(MessagesByTimePeriod.MessagesByTimePeriodReducer.class); job.setNumReduceTasks(1); FileOutputFormat.setOutputPath(job, new Path(MessagesByTimePeriod.MAIL_OUT)); return job; }
From source file:com.igalia.wordcount.WordCount.java
License:Open Source License
public int run(String[] arg0) throws Exception { Job job = new Job(getConf()); job.setJarByClass(WordCount.class); job.setJobName("wordcount"); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MapClass.class); job.setReducerClass(Reduce.class); FileInputFormat.setInputPaths(job, new Path("/tmp/wordcount/in")); FileOutputFormat.setOutputPath(job, new Path("/tmp/wordcount/out")); boolean success = job.waitForCompletion(true); return success ? 0 : 1; }
From source file:com.ikanow.infinit.e.core.mapreduce.HadoopJobRunner.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) private String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); createConfigXML(xml, job.jobtitle, job.inputCollection, getQueryOrProcessing(job.query, QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), job.outputCollectionTemp, job.mapper, job.reducer, job.combiner, getQueryOrProcessing(job.query, QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments);// w w w.j a va 2s .c om ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Now load the XML into a configuration object: Configuration config = new Configuration(); try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Now run the JAR file try { config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { String trackerUrl = HadoopUtils.getXMLProperty( prop_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); String fsUrl = HadoopUtils.getXMLProperty( prop_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("mapred.job.tracker", trackerUrl); config.set("fs.default.name", fsUrl); } Job hj = new Job(config); Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); if ((null != job.exportToHdfs) && job.exportToHdfs) { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); Path outPath = this.ensureOutputDirectory(job); SequenceFileOutputFormat.setOutputPath(hj, outPath); } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("com.mongodb.hadoop.MongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) Class.forName(job.mapper, true, child)); if ((null != job.reducer) && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); if (bLocalMode) { hj.waitForCompletion(false); return "local_done"; } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + HarvestExceptionUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }
From source file:com.ikanow.infinit.e.processing.custom.launcher.CustomHadoopTaskLauncher.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public String runHadoopJob(CustomMapReduceJobPojo job, String tempJarLocation) throws IOException, SAXException, ParserConfigurationException { StringWriter xml = new StringWriter(); String outputCollection = job.outputCollectionTemp;// (non-append mode) if ((null != job.appendResults) && job.appendResults) outputCollection = job.outputCollection; // (append mode, write directly in....) else if (null != job.incrementalMode) job.incrementalMode = false; // (not allowed to be in incremental mode and not update mode) createConfigXML(xml, job.jobtitle, job.inputCollection, InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.INPUTFIELDS), job.isCustomTable, job.getOutputDatabase(), job._id.toString(), outputCollection, job.mapper, job.reducer, job.combiner,//from w w w . ja v a2s .c om InfiniteHadoopUtils.getQueryOrProcessing(job.query, InfiniteHadoopUtils.QuerySpec.QUERY), job.communityIds, job.outputKey, job.outputValue, job.arguments, job.incrementalMode, job.submitterID, job.selfMerge, job.outputCollection, job.appendResults); ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader(); URLClassLoader child = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, savedClassLoader); Thread.currentThread().setContextClassLoader(child); // Check version: for now, any infinit.e.data_model with an VersionTest class is acceptable boolean dataModelLoaded = true; try { URLClassLoader versionTest = new URLClassLoader(new URL[] { new File(tempJarLocation).toURI().toURL() }, null); try { Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, versionTest); } catch (ClassNotFoundException e2) { //(this is fine, will use the cached version) dataModelLoaded = false; } if (dataModelLoaded) Class.forName("com.ikanow.infinit.e.data_model.custom.InfiniteMongoVersionTest", true, versionTest); } catch (ClassNotFoundException e1) { throw new RuntimeException( "This JAR is compiled with too old a version of the data-model, please recompile with Jan 2014 (rc2) onwards"); } // Now load the XML into a configuration object: Configuration config = new Configuration(); // Add the client configuration overrides: if (!bLocalMode) { String hadoopConfigPath = props_custom.getHadoopConfigPath() + "/hadoop/"; config.addResource(new Path(hadoopConfigPath + "core-site.xml")); config.addResource(new Path(hadoopConfigPath + "mapred-site.xml")); config.addResource(new Path(hadoopConfigPath + "hadoop-site.xml")); } //TESTED try { DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); Document doc = dBuilder.parse(new ByteArrayInputStream(xml.toString().getBytes())); NodeList nList = doc.getElementsByTagName("property"); for (int temp = 0; temp < nList.getLength(); temp++) { Node nNode = nList.item(temp); if (nNode.getNodeType() == Node.ELEMENT_NODE) { Element eElement = (Element) nNode; String name = getTagValue("name", eElement); String value = getTagValue("value", eElement); if ((null != name) && (null != value)) { config.set(name, value); } } } } catch (Exception e) { throw new IOException(e.getMessage()); } // Some other config defaults: // (not sure if these are actually applied, or derived from the defaults - for some reason they don't appear in CDH's client config) config.set("mapred.map.tasks.speculative.execution", "false"); config.set("mapred.reduce.tasks.speculative.execution", "false"); // (default security is ignored here, have it set via HADOOP_TASKTRACKER_CONF in cloudera) // Now run the JAR file try { BasicDBObject advancedConfigurationDbo = null; try { advancedConfigurationDbo = (null != job.query) ? ((BasicDBObject) com.mongodb.util.JSON.parse(job.query)) : (new BasicDBObject()); } catch (Exception e) { advancedConfigurationDbo = new BasicDBObject(); } boolean esMode = advancedConfigurationDbo.containsField("qt") && !job.isCustomTable; if (esMode && !job.inputCollection.equals("doc_metadata.metadata")) { throw new RuntimeException( "Infinit.e Queries are only supported on doc_metadata - use MongoDB queries instead."); } config.setBoolean("mapred.used.genericoptionsparser", true); // (just stops an annoying warning from appearing) if (bLocalMode) { // local job tracker and FS mode config.set("mapred.job.tracker", "local"); config.set("fs.default.name", "local"); } else { if (bTestMode) { // run job tracker locally but FS mode remotely config.set("mapred.job.tracker", "local"); } else { // normal job tracker String trackerUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/mapred-site.xml", "mapred.job.tracker"); config.set("mapred.job.tracker", trackerUrl); } String fsUrl = HadoopUtils.getXMLProperty( props_custom.getHadoopConfigPath() + "/hadoop/core-site.xml", "fs.default.name"); config.set("fs.default.name", fsUrl); } if (!dataModelLoaded && !(bTestMode || bLocalMode)) { // If running distributed and no data model loaded then add ourselves Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.data_model.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/", "infinit.e.processing.custom.library.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } //TESTED // Debug scripts (only if they exist), and only in non local/test mode if (!bLocalMode && !bTestMode) { try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_map_error_handler.sh", config); config.set("mapred.map.task.debug.script", "custom_map_error_handler.sh " + job.jobtitle); config.set("mapreduce.map.debug.script", "custom_map_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on try { Path scriptToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/scripts/", "custom_reduce_error_handler.sh", config); config.set("mapred.reduce.task.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); config.set("mapreduce.reduce.debug.script", "custom_reduce_error_handler.sh " + job.jobtitle); DistributedCache.createSymlink(config); DistributedCache.addCacheFile(scriptToCache.toUri(), config); } catch (Exception e) { } // just carry on } //TODO (???): TOTEST // (need to do these 2 things here before the job is created, at which point the config class has been copied across) //1) Class<?> mapperClazz = Class.forName(job.mapper, true, child); if (ICustomInfiniteInternalEngine.class.isAssignableFrom(mapperClazz)) { // Special case: internal custom engine, so gets an additional integration hook ICustomInfiniteInternalEngine preActivities = (ICustomInfiniteInternalEngine) mapperClazz .newInstance(); preActivities.preTaskActivities(job._id, job.communityIds, config, !(bTestMode || bLocalMode)); } //TESTED //2) if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { // Need to download the GridFSZip file try { Path jarToCache = InfiniteHadoopUtils.cacheLocalFile("/opt/infinite-home/lib/unbundled/", "GridFSZipFile.jar", config); DistributedCache.addFileToClassPath(jarToCache, config); } catch (Throwable t) { } // (this is fine, will already be on the classpath .. otherwise lots of other stuff will be failing all over the place!) } if (job.inputCollection.equals("records")) { InfiniteElasticsearchHadoopUtils.handleElasticsearchInput(job, config, advancedConfigurationDbo); //(won't run under 0.19 so running with "records" should cause all sorts of exceptions) } //TESTED (by hand) if (bTestMode || bLocalMode) { // If running locally, turn "snappy" off - tomcat isn't pointing its native library path in the right place config.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.DefaultCodec"); } // Manually specified caches List<URL> localJarCaches = InfiniteHadoopUtils.handleCacheList(advancedConfigurationDbo.get("$caches"), job, config, props_custom); Job hj = new Job(config); // (NOTE: from here, changes to config are ignored) try { if (null != localJarCaches) { if (bLocalMode || bTestMode) { Method method = URLClassLoader.class.getDeclaredMethod("addURL", new Class[] { URL.class }); method.setAccessible(true); method.invoke(child, localJarCaches.toArray()); } //TOTEST (tested logically) } Class<?> classToLoad = Class.forName(job.mapper, true, child); hj.setJarByClass(classToLoad); if (job.inputCollection.equalsIgnoreCase("filesystem")) { String inputPath = null; try { inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); if (!inputPath.endsWith("/")) { inputPath = inputPath + "/"; } } catch (Exception e) { } if (null == inputPath) { throw new RuntimeException("Must specify 'file.url' if reading from filesystem."); } inputPath = InfiniteHadoopUtils.authenticateInputDirectory(job, inputPath); InfiniteFileInputFormat.addInputPath(hj, new Path(inputPath + "*/*")); // (that extra bit makes it recursive) InfiniteFileInputFormat.setMaxInputSplitSize(hj, 33554432); // (32MB) InfiniteFileInputFormat.setInfiniteInputPathFilter(hj, config); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat", true, child)); } else if (job.inputCollection.equalsIgnoreCase("file.binary_shares")) { String[] oidStrs = null; try { String inputPath = MongoDbUtil.getProperty(advancedConfigurationDbo, "file.url"); Pattern oidExtractor = Pattern.compile("inf://share/([^/]+)"); Matcher m = oidExtractor.matcher(inputPath); if (m.find()) { oidStrs = m.group(1).split("\\s*,\\s*"); } else { throw new RuntimeException( "file.url must be in format inf://share/<oid-list>/<string>: " + inputPath); } InfiniteHadoopUtils.authenticateShareList(job, oidStrs); } catch (Exception e) { throw new RuntimeException( "Authentication error: " + e.getMessage() + ": " + advancedConfigurationDbo, e); } hj.getConfiguration().setStrings("mapred.input.dir", oidStrs); hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat", true, child)); } else if (job.inputCollection.equals("records")) { hj.setInputFormatClass((Class<? extends InputFormat>) Class .forName("com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat", true, child)); } else { if (esMode) { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat", true, child)); } else { hj.setInputFormatClass((Class<? extends InputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat", true, child)); } } if ((null != job.exportToHdfs) && job.exportToHdfs) { //TODO (INF-2469): Also, if the output key is BSON then also run as text (but output as JSON?) Path outPath = InfiniteHadoopUtils.ensureOutputDirectory(job, props_custom); if ((null != job.outputKey) && (null != job.outputValue) && job.outputKey.equalsIgnoreCase("org.apache.hadoop.io.text") && job.outputValue.equalsIgnoreCase("org.apache.hadoop.io.text")) { // (slight hack before I sort out the horrendous job class - if key/val both text and exporting to HDFS then output as Text) hj.setOutputFormatClass((Class<? extends OutputFormat>) Class .forName("org.apache.hadoop.mapreduce.lib.output.TextOutputFormat", true, child)); TextOutputFormat.setOutputPath(hj, outPath); } //TESTED else { hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat", true, child)); SequenceFileOutputFormat.setOutputPath(hj, outPath); } //TESTED } else { // normal case, stays in MongoDB hj.setOutputFormatClass((Class<? extends OutputFormat>) Class.forName( "com.ikanow.infinit.e.data_model.custom.InfiniteMongoOutputFormat", true, child)); } hj.setMapperClass((Class<? extends Mapper>) mapperClazz); String mapperOutputKeyOverride = advancedConfigurationDbo.getString("$mapper_key_class", null); if (null != mapperOutputKeyOverride) { hj.setMapOutputKeyClass(Class.forName(mapperOutputKeyOverride)); } //TESTED String mapperOutputValueOverride = advancedConfigurationDbo.getString("$mapper_value_class", null); if (null != mapperOutputValueOverride) { hj.setMapOutputValueClass(Class.forName(mapperOutputValueOverride)); } //TESTED if ((null != job.reducer) && !job.reducer.startsWith("#") && !job.reducer.equalsIgnoreCase("null") && !job.reducer.equalsIgnoreCase("none")) { hj.setReducerClass((Class<? extends Reducer>) Class.forName(job.reducer, true, child)); // Variable reducers: if (null != job.query) { try { hj.setNumReduceTasks(advancedConfigurationDbo.getInt("$reducers", 1)); } catch (Exception e) { try { // (just check it's not a string that is a valid int) hj.setNumReduceTasks( Integer.parseInt(advancedConfigurationDbo.getString("$reducers", "1"))); } catch (Exception e2) { } } } //TESTED } else { hj.setNumReduceTasks(0); } if ((null != job.combiner) && !job.combiner.startsWith("#") && !job.combiner.equalsIgnoreCase("null") && !job.combiner.equalsIgnoreCase("none")) { hj.setCombinerClass((Class<? extends Reducer>) Class.forName(job.combiner, true, child)); } hj.setOutputKeyClass(Class.forName(job.outputKey, true, child)); hj.setOutputValueClass(Class.forName(job.outputValue, true, child)); hj.setJobName(job.jobtitle); currJobName = job.jobtitle; } catch (Error e) { // (messing about with class loaders = lots of chances for errors!) throw new RuntimeException(e.getMessage(), e); } if (bTestMode || bLocalMode) { hj.submit(); currThreadId = null; Logger.getRootLogger().addAppender(this); currLocalJobId = hj.getJobID().toString(); currLocalJobErrs.setLength(0); while (!hj.isComplete()) { Thread.sleep(1000); } Logger.getRootLogger().removeAppender(this); if (hj.isSuccessful()) { if (this.currLocalJobErrs.length() > 0) { return "local_done: " + this.currLocalJobErrs.toString(); } else { return "local_done"; } } else { return "Error: " + this.currLocalJobErrs.toString(); } } else { hj.submit(); String jobId = hj.getJobID().toString(); return jobId; } } catch (Exception e) { e.printStackTrace(); Thread.currentThread().setContextClassLoader(savedClassLoader); return "Error: " + InfiniteHadoopUtils.createExceptionMessage(e); } finally { Thread.currentThread().setContextClassLoader(savedClassLoader); } }
From source file:com.impetus.code.examples.hadoop.cassandra.wordcount.WordCount.java
License:Apache License
public int run(String[] args) throws Exception { String outputReducerType = "cassandra"; if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) { String[] s = args[0].split("="); if (s != null && s.length == 2) outputReducerType = s[1];/*from w w w.j a va 2s . c o m*/ } logger.info("output reducer type: " + outputReducerType); for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) { String columnName = "text" + i; getConf().set(CONF_COLUMN_NAME, columnName); Job job = new Job(getConf(), "wordcount"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); if (outputReducerType.equalsIgnoreCase("filesystem")) { job.setCombinerClass(ReducerToFilesystem.class); job.setReducerClass(ReducerToFilesystem.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i)); } else { job.setReducerClass(ReducerToCassandra.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(ByteBuffer.class); job.setOutputValueClass(List.class); job.setOutputFormatClass(ColumnFamilyOutputFormat.class); ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY); } job.setInputFormatClass(ColumnFamilyInputFormat.class); ConfigHelper.setRpcPort(job.getConfiguration(), "9160"); ConfigHelper.setInitialAddress(job.getConfiguration(), "localhost"); ConfigHelper.setPartitioner(job.getConfiguration(), "org.apache.cassandra.dht.RandomPartitioner"); ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, INPUT_COLUMN_FAMILY); SlicePredicate predicate = new SlicePredicate() .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName))); ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate); job.waitForCompletion(true); } return 0; }