List of usage examples for org.apache.hadoop.fs.FileSystem.delete
public abstract boolean delete(Path f, boolean recursive) throws IOException;
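Before the individual examples, here is the pattern most of them share: resolve the FileSystem that owns a path, check whether the path exists, and delete it recursively before a job writes new output. The following is a minimal standalone sketch of that idiom, not taken from any project below; the DeleteExample class name and the /tmp/example-output path are hypothetical placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DeleteExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path outputPath = new Path("/tmp/example-output"); // hypothetical path

        // Resolve the FileSystem instance (local, HDFS, ...) that owns this path.
        FileSystem fs = outputPath.getFileSystem(conf);

        // delete(path, true) removes a directory and everything under it;
        // with recursive == false, deleting a non-empty directory fails.
        if (fs.exists(outputPath)) {
            boolean deleted = fs.delete(outputPath, true);
            System.out.println("Deleted " + outputPath + ": " + deleted);
        }
    }
}

Checking the boolean return value, as the bulkload.ImportTsv example below does, is safer than assuming the delete succeeded.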
From source file: be_uclouvain_ingi2145_lab05.GiraphJobRunner.java
@Override
public int run(String[] strings) throws Exception {
    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    //gconf.setVertexClass(SimpleShortestPathsComputation.class);
    /*
    gconf.setVertexInputFormatClass(SimpleShortestPathsVertexInputFormat.class);
    gconf.setVertexOutputFormatClass(SimpleShortestPathsVertexOutputFormat.class);
    */
    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    //GiraphYarnClient job = new GiraphYarnClient(gconf, gconf.getClass().getName());
    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());

    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));
            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            // Check whether the output path exists; if it does, delete it recursively.
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }

    /*
    if (cmd.hasOption("vif") || cmd.hasOption("eif")) {
        if (cmd.hasOption("vip")) {
            FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op")));
        }
    }
    */

    // If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        LOG.fatal("" + Arrays.toString(args));
        gconf.set("ca", args[0].split("=")[1]);
        LOG.fatal("" + gconf.get("ca"));
        gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")),
                Integer.parseInt(cmd.getOptionValue("w")), 100.0f);
    }

    /*
    if (cmd.hasOption("cf")) {
        DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")), job.getConfiguration());
    }
    */

    return job.run(true) ? 0 : -1;
}
From source file: bixo.examples.crawl.DemoCrawlTool.java
License: Apache License
public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check whether the user wants to clean the output dir.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);

        // This causes Bixo to block while waiting for the next time it can fetch from a particular site.
        // TODO: may not be necessary in future versions of Bixo
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: bixo.examples.crawl.JDBCCrawlTool.java
License: Apache License
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }

    JDBCTapFactory.shutdown();
}
From source file: bixo.examples.JDBCCrawlTool.java
License: Open Source License
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }

    JDBCTapFactory.shutdown();
}
From source file: bixo.examples.webmining.DemoWebMiningTool.java
License: Apache License
static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {
    // Check if we already have a crawldb
    Path crawlDbPath = null;
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");

        while (loopDirPath != null) {
            fs.delete(loopDirPath, true);
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoWebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);
}
From source file: bixo.examples.webmining.WebMiningTool.java
License: Apache License
static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {
    // Check if we already have a crawldb
    Path crawlDbPath = null;
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");

        while (loopDirPath != null) {
            fs.delete(loopDirPath, true);
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    WebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);
}
From source file: boa.datagen.SeqSortMerge.java
License: Apache License
public static void main(String[] args) throws IOException {
    conf.set("fs.default.name", base);
    FileSystem fs = FileSystem.get(conf);

    String inPath = "/tmprepcache/2015-07-sorted/";
    while (true) {
        FileStatus[] files = fs.listStatus(new Path(inPath));
        if (files.length < 2)
            break;

        Path path = new Path(inPath + System.currentTimeMillis());
        fs.mkdirs(path);
        SequenceFile.Writer w = SequenceFile.createWriter(fs, conf,
                new Path(inPath + path.getName() + "/part-00000"), Text.class, BytesWritable.class);

        FileStatus[] candidates = getCandidates(files);
        System.out.println("Merging " + candidates.length + " from " + files.length);

        SequenceFile.Reader[] readers = new SequenceFile.Reader[candidates.length];
        for (int i = 0; i < candidates.length; i++)
            readers[i] = new SequenceFile.Reader(fs,
                    new Path(inPath + candidates[i].getPath().getName() + "/part-00000"), conf);

        Text[] keys = new Text[candidates.length];
        BytesWritable[] values = new BytesWritable[candidates.length];
        read(readers, keys, values);

        while (true) {
            int index = min(keys);
            if (keys[index].toString().isEmpty())
                break;
            w.append(keys[index], values[index]);
            read(readers[index], keys[index], values[index]);
        }

        for (int i = 0; i < readers.length; i++)
            readers[i].close();
        w.close();

        for (int i = 0; i < readers.length; i++)
            fs.delete(new Path(inPath + candidates[i].getPath().getName()), true);
    }
}
From source file: boostingPL.driver.SAMMEPLDriver.java
License: Open Source License
@Override
public int run(String[] args) throws Exception {
    int status = commandAnalysis(args);
    if (status != 0) {
        return status;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(getConf());
    job.setJobName("SAMMEPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " "
            + numLinesPerMap + " " + numIterations);
    job.setJarByClass(SAMMEPLDriver.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dataPath);
    NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);

    FileSystem fs = modelPath.getFileSystem(getConf());
    if (fs.exists(modelPath)) {
        fs.delete(modelPath, true);
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, modelPath);

    if (runModel.equals("train")) {
        job.setMapperClass(AdaBoostPLMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ClassifierWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(ClassifierWritable.class);
    } else {
        job.setMapperClass(AdaBoostPLTestMapper.class);
        job.setReducerClass(AdaBoostPLTestReducer.class);
        job.setOutputFormatClass(NullOutputFormat.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
    }

    Configuration conf = job.getConfiguration();
    conf.set("BoostingPL.boostingName", "SAMME");
    conf.set("BoostingPL.numIterations", String.valueOf(numIterations));
    conf.set("BoostingPL.modelPath", modelPath.toString());
    if (metadataPath == null) {
        conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata");
    } else {
        conf.set("BoostingPL.metadata", metadataPath.toString());
    }
    if (outputFolder != null) {
        conf.set("BoostingPL.outputFolder", outputFolder.toString());
    }

    LOG.info(StringUtils.arrayToString(args));
    return job.waitForCompletion(true) == true ? 0 : -1;
}
From source file: br.com.lassal.nqueens.grid.job.NQueenCounter.java
/**
 * Invocation format:
 * <> {number of queens} {root directory} -F
 *
 * @param args
 * @return
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    // Configuration processed by ToolRunner
    Configuration conf = getConf();

    // Create a Job using the processed conf
    Job job = new Job(conf, "nqueens-counter");
    job.setJarByClass(NQueenCounter.class);

    int queensNumber = Integer.parseInt(args[0]);
    String workingFolder = args.length >= 2 ? args[1] : null;
    boolean isFinal = args.length >= 3 && "-F".equals(args[2]) ? true : false;

    Path sourcePath = this.setWorkingFolder(queensNumber, workingFolder, isFinal, job);
    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    if (isFinal) {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultReducer.class);
    } else {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterReducer.class);
    }

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);

    if (sourcePath != null) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(sourcePath, true);
    }

    return result ? 0 : 1;
}
From source file: bulkload.ImportTsv.java
License: Apache License
/**
 * Sets up the actual job.
 *
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
        try (Admin admin = connection.getAdmin()) {
            // Support non-XML supported characters
            // by re-encoding the passed separator as a Base64 string.
            String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
            if (actualSeparator != null) {
                conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
            }

            TableName tableName = TableName.valueOf(args[0]);
            if (!admin.tableExists(tableName)) {
                String errorMsg = format("Table '%s' does not exist.", tableName);
                LOG.error(errorMsg);
                throw new TableNotFoundException(errorMsg);
            }

            Path inputDir = new Path(args[1]);
            String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
            job = Job.getInstance(conf, jobName);
            job.setJarByClass(TsvImporter.class);
            FileInputFormat.setInputPaths(job, inputDir);
            job.setInputFormatClass(TextInputFormat.class);
            job.setMapperClass(TsvImporter.class);

            String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
            if (hfileOutPath != null) {
                try (HTable table = (HTable) connection.getTable(tableName)) {
                    Path outputDir = new Path(hfileOutPath);
                    FileSystem fs = FileSystem.get(conf);
                    if (fs.exists(outputDir)) {
                        if (!fs.delete(outputDir, true)) {
                            throw new IllegalStateException("delete path:" + outputDir + " failed");
                        }
                    }
                    FileOutputFormat.setOutputPath(job, outputDir);
                    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
                    job.setMapOutputValueClass(Put.class);
                    job.setReducerClass(PutSortReducer.class);
                    HFileOutputFormat2.configureIncrementalLoad(job, table, table);
                }
            } else {
                // No reducers. Just write straight to table. Call
                // initTableReducerJob to set up the TableOutputFormat.
                TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
                job.setNumReduceTasks(0);

                // TableMapReduceUtil.addDependencyJars(job);
                // TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                //     com.google.common.base.Function.class /* Guava used by TsvParser */);
            }

            // Workaround to remove unnecessary hadoop dependencies
            String[] jars = job.getConfiguration().get("tmpjars").split(",", -1);
            StringBuilder filteredJars = new StringBuilder();
            for (String j : jars) {
                String[] parts = j.split("/", -1);
                String fileName = parts[parts.length - 1];
                if (fileName.indexOf("hadoop-") != 0) {
                    filteredJars.append(j);
                    filteredJars.append(",");
                }
            }
            job.getConfiguration().set("tmpjars", filteredJars.toString());
        }
    }

    return job;
}