Example usage for org.apache.hadoop.fs FileSystem delete

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileSystem delete.

Prototype

public abstract boolean delete(Path f, boolean recursive) throws IOException;

Source Link

Document

Delete a file.

Usage

From source file:be_uclouvain_ingi2145_lab05.GiraphJobRunner.java

@Override
public int run(String[] strings) throws Exception {

    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    //gconf.setVertexClass(SimpleShortestPathsComputation.class);
    /*gconf.setVertexInputFormatClass(
        SimpleShortestPathsVertexInputFormat.class);
    gconf.setVertexOutputFormatClass(// w w  w  .j a  va 2  s .c  o  m
        SimpleShortestPathsVertexOutputFormat.class);
    */
    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    //GiraphYarnClient job = new GiraphYarnClient(gconf,gconf.getClass().getName());
    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());
    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));

            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            /*Check if output path (args[1])exist or not*/
            if (fs.exists(outputPath)) {
                /*If exist delete the output path*/
                fs.delete(outputPath, true);
            }

            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }
    /*
    if (cmd.hasOption("vif") || cmd.hasOption("eif")) {
      if (cmd.hasOption("vip")) {
          FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op")));
      }
    }*/
    //If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        LOG.fatal("" + Arrays.toString(args));

        gconf.set("ca", args[0].split("=")[1]);
        LOG.fatal("" + gconf.get("ca"));
        gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")),
                Integer.parseInt(cmd.getOptionValue("w")), 100.0f);
    }
    /*
    if (cmd.hasOption("cf")) {
      DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")),
          job.getConfiguration());
    }
    */
    return job.run(true) ? 0 : -1;
}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {// w ww .j  a  v a2 s  . co  m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user want to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the 
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {/*from ww  w. j av  a 2 s.c  o m*/
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}

From source file:bixo.examples.JDBCCrawlTool.java

License:Open Source License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {//from w ww .  jav a  2s .  co m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}

From source file:bixo.examples.webmining.DemoWebMiningTool.java

License:Apache License

static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {

    // Check if we already have a crawldb
    Path crawlDbPath = null;/*ww w. j  a va 2  s.  co m*/
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");
        while (loopDirPath != null) {
            fs.delete(loopDirPath, true);
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoWebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);

}

From source file:bixo.examples.webmining.WebMiningTool.java

License:Apache License

static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {

    // Check if we already have a crawldb
    Path crawlDbPath = null;//  w  ww.ja  v  a 2  s .com
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");
        while (loopDirPath != null) {
            fs.delete(loopDirPath, true);
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    WebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);

}

From source file:boa.datagen.SeqSortMerge.java

License:Apache License

public static void main(String[] args) throws IOException {
    conf.set("fs.default.name", base);
    FileSystem fs = FileSystem.get(conf);

    String inPath = "/tmprepcache/2015-07-sorted/";
    while (true) {
        FileStatus[] files = fs.listStatus(new Path(inPath));
        if (files.length < 2)
            break;
        Path path = new Path(inPath + System.currentTimeMillis());
        fs.mkdirs(path);//from  ww  w. j a  v  a 2  s .c  o  m
        SequenceFile.Writer w = SequenceFile.createWriter(fs, conf,
                new Path(inPath + path.getName() + "/part-00000"), Text.class, BytesWritable.class);
        FileStatus[] candidates = getCandidates(files);
        System.out.println("Merging " + candidates.length + " from " + files.length);
        SequenceFile.Reader[] readers = new SequenceFile.Reader[candidates.length];
        for (int i = 0; i < candidates.length; i++)
            readers[i] = new SequenceFile.Reader(fs,
                    new Path(inPath + candidates[i].getPath().getName() + "/part-00000"), conf);
        Text[] keys = new Text[candidates.length];
        BytesWritable[] values = new BytesWritable[candidates.length];
        read(readers, keys, values);
        while (true) {
            int index = min(keys);
            if (keys[index].toString().isEmpty())
                break;
            w.append(keys[index], values[index]);
            read(readers[index], keys[index], values[index]);
        }
        for (int i = 0; i < readers.length; i++)
            readers[i].close();
        w.close();
        for (int i = 0; i < readers.length; i++)
            fs.delete(new Path(inPath + candidates[i].getPath().getName()), true);
    }
}

From source file:boostingPL.driver.SAMMEPLDriver.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    int status = commandAnalysis(args);
    if (status != 0) {
        return status;
    }//w w  w  .  j  av  a2  s  .c  om

    @SuppressWarnings("deprecation")
    Job job = new Job(getConf());
    job.setJobName("SAMMEPL:" + runModel + " " + dataPath.toString() + " " + modelPath.toString() + " "
            + numLinesPerMap + " " + numIterations);
    job.setJarByClass(SAMMEPLDriver.class);

    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dataPath);
    NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);
    FileSystem fs = modelPath.getFileSystem(getConf());
    if (fs.exists(modelPath)) {
        fs.delete(modelPath, true);
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, modelPath);

    if (runModel.equals("train")) {
        job.setMapperClass(AdaBoostPLMapper.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ClassifierWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(ClassifierWritable.class);
    } else {
        job.setMapperClass(AdaBoostPLTestMapper.class);
        job.setReducerClass(AdaBoostPLTestReducer.class);
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
    }

    Configuration conf = job.getConfiguration();
    conf.set("BoostingPL.boostingName", "SAMME");
    conf.set("BoostingPL.numIterations", String.valueOf(numIterations));
    conf.set("BoostingPL.modelPath", modelPath.toString());
    if (metadataPath == null) {
        conf.set("BoostingPL.metadata", dataPath.toString() + ".metadata");
    } else {
        conf.set("BoostingPL.metadata", metadataPath.toString());
    }
    if (outputFolder != null) {
        conf.set("BoostingPL.outputFolder", outputFolder.toString());
    }

    LOG.info(StringUtils.arrayToString(args));
    return job.waitForCompletion(true) == true ? 0 : -1;
}

From source file:br.com.lassal.nqueens.grid.job.NQueenCounter.java

/**
 * Forma de chamada/*w  w w . j  a va2s  . com*/
 * <> {numero de rainhas} {diretorio raiz} -F
 *
 * @param strings
 * @return
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    // Configuration processed by ToolRunner
    Configuration conf = getConf();

    // Create a JobConf using the processed conf
    Job job = new Job(conf, "nqueens-counter");
    job.setJarByClass(NQueenCounter.class);

    int queensNumber = Integer.parseInt(args[0]);
    String workingFolder = args.length >= 2 ? args[1] : null;
    boolean isFinal = args.length >= 3 && "-F".equals(args[2]) ? true : false;

    Path sourcePath = this.setWorkingFolder(queensNumber, workingFolder, isFinal, job);
    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    if (isFinal) {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultReducer.class);
    } else {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterReducer.class);
    }

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);

    if (sourcePath != null) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(sourcePath, true);
    }

    return result ? 0 : 1;

}

From source file:bulkload.ImportTsv.java

License:Apache License

/**
 * Sets up the actual job.//from  ww w.jav a 2 s .c o m
 * 
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {

    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
        try (Admin admin = connection.getAdmin()) {
            // Support non-XML supported characters
            // by re-encoding the passed separator as a Base64 string.
            String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
            if (actualSeparator != null) {
                conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
            }
            TableName tableName = TableName.valueOf(args[0]);
            if (!admin.tableExists(tableName)) {
                String errorMsg = format("Table '%s' does not exist.", tableName);
                LOG.error(errorMsg);
                throw new TableNotFoundException(errorMsg);
            }
            Path inputDir = new Path(args[1]);
            String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
            job = Job.getInstance(conf, jobName);
            job.setJarByClass(TsvImporter.class);
            FileInputFormat.setInputPaths(job, inputDir);
            job.setInputFormatClass(TextInputFormat.class);
            job.setMapperClass(TsvImporter.class);

            String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
            if (hfileOutPath != null) {
                try (HTable table = (HTable) connection.getTable(tableName)) {
                    Path outputDir = new Path(hfileOutPath);
                    FileSystem fs = FileSystem.get(conf);
                    if (fs.exists(outputDir)) {
                        if (!fs.delete(outputDir, true)) {
                            throw new IllegalStateException("delete path:" + outputDir + " failed");
                        }
                    }
                    FileOutputFormat.setOutputPath(job, outputDir);
                    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
                    job.setMapOutputValueClass(Put.class);
                    job.setReducerClass(PutSortReducer.class);
                    HFileOutputFormat2.configureIncrementalLoad(job, table, table);
                }
            } else {
                // No reducers. Just write straight to table. Call
                // initTableReducerJob
                // to set up the TableOutputFormat.
                TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
                job.setNumReduceTasks(0);

                //               TableMapReduceUtil.addDependencyJars(job);
                //               TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                //                     com.google.common.base.Function.class /* Guava used by TsvParser */);
            }

            // Workaround to remove unnecessary hadoop dependencies
            String[] jars = job.getConfiguration().get("tmpjars").split(",", -1);
            StringBuilder filteredJars = new StringBuilder();
            for (String j : jars) {
                String[] parts = j.split("/", -1);
                String fileName = parts[parts.length - 1];
                if (fileName.indexOf("hadoop-") != 0) {
                    filteredJars.append(j);
                    filteredJars.append(",");
                }
            }
            job.getConfiguration().set("tmpjars", filteredJars.toString());
        }
    }

    return job;
}