Example usage for org.apache.hadoop.fs FileSystem delete

List of usage examples for org.apache.hadoop.fs FileSystem delete

Introduction

This page collects usage examples of the delete method of org.apache.hadoop.fs.FileSystem.

Prototype

public abstract boolean delete(Path f, boolean recursive) throws IOException;

Source Link

Document

Deletes a file or directory. When recursive is true, a directory is deleted together with its contents.

Usage

From source file:be_uclouvain_ingi2145_lab05.GiraphJobRunner.java

/**
 * Configures and runs a Giraph job from the given command-line arguments.
 *
 * <p>When a vertex or edge output format ("vof"/"eof") and an output path ("op") are supplied,
 * an already existing output path is deleted recursively before it is registered as the job's
 * output path. When a custom argument ("ca") is supplied, the value part of its first
 * {@code key=value} entry is stored in the configuration under the key "ca" and the worker
 * configuration is set from the "w" option.
 *
 * @param strings the raw command-line arguments
 * @return 0 if argument parsing yields no command line or the job succeeds, -1 if the job fails
 * @throws Exception if the output path cannot be deleted, the custom argument is malformed, or
 *         the job fails to run
 */
@Override
public int run(String[] strings) throws Exception {

    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());
    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));

            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            // Remove stale output; fail loudly instead of letting the job trip over leftovers.
            if (fs.exists(outputPath) && !fs.delete(outputPath, true)) {
                throw new java.io.IOException("Unable to delete existing output path: " + outputPath);
            }

            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }

    // If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        // NOTE(review): FATAL level is used for what looks like diagnostic output - confirm intent.
        LOG.fatal("" + Arrays.toString(args));

        // Guard against entries without a value part instead of failing with a bare
        // ArrayIndexOutOfBoundsException.
        String[] keyValue = args[0].split("=");
        if (keyValue.length < 2) {
            throw new IllegalArgumentException(
                    "Custom argument must have the form key=value but was: " + args[0]);
        }
        gconf.set("ca", keyValue[1]);
        LOG.fatal("" + gconf.get("ca"));

        // The same worker count is used for both the minimum and the maximum.
        int workers = Integer.parseInt(cmd.getOptionValue("w"));
        gconf.setWorkerConfiguration(workers, workers, 100.0f);
    }

    return job.run(true) ? 0 : -1;
}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

/**
 * Command-line entry point of the demo crawl tool.
 *
 * <p>Parses the options, prepares the output directory (optionally cleaning it and seeding a
 * loop-0 crawl DB from a domain or a URL file), then runs the requested number of crawl loops,
 * each one reading the crawl DB produced by the previous loop. Any failure is reported on
 * stderr and terminates the JVM with exit code -1.
 *
 * @param args the command-line arguments, parsed into {@code DemoCrawlToolOptions}
 */
public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    // Either a domain or a urls file is required; the domain wins when both are given.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    // NOTE(review): assumes getLogsDir() never returns null (e.g. has a default) - confirm.
    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean the output directory
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the output directory does not exist (fresh start or just cleaned), set it up
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        // Resume from the most recent loop directory (loop 0 if we just created it).
        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        // NOTE(review): the multiplication below overflows if MILLISECONDS_PER_MINUTE is an
        // int and the duration is large - confirm the constant's type.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            // No filter file: start from the defaults and add one pattern that either pins
            // the crawl to the target domain or accepts any http/https url.
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                // NOTE(review): domain is concatenated unescaped, so '.' matches any char.
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate: the time left until the overall
            // deadline is split evenly across the loops that still have to run.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        // Dump the failed flow plan so it can be inspected, then abort.
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

/**
 * Command-line entry point of the JDBC-backed crawl tool.
 *
 * <p>Parses the options, prepares the output directory (deleting it when no DB location is
 * configured, otherwise making sure the DB location exists), seeds a new crawl from the target
 * domain when the output directory had to be created, and then runs the requested number of
 * crawl loops, each one taking the previous loop directory as its input. Failures are reported
 * on stderr and terminate the JVM with exit code -1.
 *
 * @param args the command-line arguments, parsed into {@code JDBCCrawlToolOptions}
 */
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    // NOTE(review): assumes getLogsDir() never returns null (e.g. has a default) - confirm.
    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        // A missing output directory means a new crawl: create loop 0 and import the domain.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                // domain is always null here, so it must not be appended to the message
                // (it used to render as "...specifiednull").
                System.err.println("For a new crawl the domain needs to be specified");
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        // Resume from the most recent loop directory (loop 0 if we just created it).
        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // Optional overall deadline for the crawl, derived from the configured duration.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            // No filter file: start from the defaults and add one pattern that either pins
            // the crawl to the target domain or accepts any http/https url.
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate: the time left until the overall
            // deadline is split evenly across the loops that still have to run.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        // Dump the failed flow plan so it can be inspected, then abort.
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    // Only reached on success: both catch blocks above exit the JVM.
    JDBCTapFactory.shutdown();
}

From source file:bixo.examples.JDBCCrawlTool.java

License:Open Source License

/**
 * Command-line entry point of the JDBC-backed crawl tool.
 *
 * <p>Parses the options, rejects domains that carry a protocol prefix or have fewer than two
 * dot-separated parts (except "localhost"), prepares the output directory, seeds a new crawl
 * from the domain when the output directory had to be created, and then runs the requested
 * number of crawl loops, each one taking the previous loop directory as its input. Failures
 * are reported on stderr and terminate the JVM with exit code -1.
 *
 * @param args the command-line arguments, parsed into {@code JDBCCrawlToolOptions}
 */
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    // NOTE(review): assumes the domain option is mandatory; a null domain would throw here - confirm.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    // Anything other than "localhost" needs at least two dot-separated parts.
    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        // A missing output directory means a new crawl: create loop 0 and import the domain.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        // Resume from the most recent loop directory (loop 0 if we just created it).
        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // Optional overall deadline for the crawl, derived from the configured duration.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate: the time left until the overall
            // deadline is split evenly across the loops that still have to run.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        // Dump the failed flow plan so it can be inspected, then abort.
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    // Only reached on success: both catch blocks above exit the JVM.
    JDBCTapFactory.shutdown();
}

From source file:bixo.examples.webmining.DemoWebMiningTool.java

License:Apache License

/**
 * Resets the working directory so that a crawl always starts from scratch: every existing loop
 * directory is deleted, then a fresh loop-0 directory is created and the seed urls are
 * imported into its crawl DB sub-directory.
 *
 * @param fs file system that holds the working directory
 * @param workingDirPath directory that contains the loop sub-directories
 * @param seedUrlsfileName name of the file with the seed urls to import
 * @throws Exception if a loop directory cannot be deleted, or creating the new loop directory
 *         or importing the seed urls fails
 */
static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {

    // Check if we already have a crawldb
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");
        while (loopDirPath != null) {
            // Stop on a failed delete: the lookup below would presumably keep returning the
            // same directory and the loop would never terminate.
            if (!fs.delete(loopDirPath, true)) {
                throw new java.io.IOException("Unable to delete loop directory: " + loopDirPath);
            }
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    Path crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoWebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);

}

From source file:bixo.examples.webmining.WebMiningTool.java

License:Apache License

/**
 * Resets the working directory so that a crawl always starts from scratch: every existing loop
 * directory is deleted, then a fresh loop-0 directory is created and the seed urls are
 * imported into its crawl DB sub-directory.
 *
 * @param fs file system that holds the working directory
 * @param workingDirPath directory that contains the loop sub-directories
 * @param seedUrlsfileName name of the file with the seed urls to import
 * @throws Exception if a loop directory cannot be deleted, or creating the new loop directory
 *         or importing the seed urls fails
 */
static void setupWorkingDir(FileSystem fs, Path workingDirPath, String seedUrlsfileName) throws Exception {

    // Check if we already have a crawldb
    Path loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
    if (loopDirPath != null) {
        // Clear out any previous loop directory, so we're always starting from scratch
        LOGGER.info("deleting existing working dir");
        while (loopDirPath != null) {
            // Stop on a failed delete: the lookup below would presumably keep returning the
            // same directory and the loop would never terminate.
            if (!fs.delete(loopDirPath, true)) {
                throw new java.io.IOException("Unable to delete loop directory: " + loopDirPath);
            }
            loopDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        }
    }

    // Create a "0-<timestamp>" loop sub-directory and import the seed urls
    loopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, 0);
    Path crawlDbPath = new Path(loopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    WebMiningWorkflow.importSeedUrls(crawlDbPath, seedUrlsfileName);

}

From source file:boa.datagen.SeqSortMerge.java

License:Apache License

/**
 * Repeatedly merges sequence files below the hard-coded input directory until fewer than two
 * entries remain.
 *
 * <p>Each round creates a new timestamp-named directory, merges the {@code part-00000} files
 * of the selected candidate directories into it, and then deletes the merged candidates. The
 * merge loop stops as soon as the smallest current key is empty.
 *
 * @param args unused
 * @throws IOException if any file system operation fails or a merged input cannot be deleted
 */
public static void main(String[] args) throws IOException {
    conf.set("fs.default.name", base);
    FileSystem fs = FileSystem.get(conf);

    String inPath = "/tmprepcache/2015-07-sorted/";
    while (true) {
        FileStatus[] files = fs.listStatus(new Path(inPath));
        // NOTE(review): some Hadoop versions appear to return null for a missing path - the
        // null check treats that like "nothing to merge".
        if (files == null || files.length < 2) {
            break;
        }
        Path path = new Path(inPath + System.currentTimeMillis());
        fs.mkdirs(path);
        SequenceFile.Writer w = SequenceFile.createWriter(fs, conf,
                new Path(inPath + path.getName() + "/part-00000"), Text.class, BytesWritable.class);
        FileStatus[] candidates;
        SequenceFile.Reader[] readers = null;
        try {
            candidates = getCandidates(files);
            System.out.println("Merging " + candidates.length + " from " + files.length);
            readers = new SequenceFile.Reader[candidates.length];
            for (int i = 0; i < candidates.length; i++) {
                readers[i] = new SequenceFile.Reader(fs,
                        new Path(inPath + candidates[i].getPath().getName() + "/part-00000"), conf);
            }
            Text[] keys = new Text[candidates.length];
            BytesWritable[] values = new BytesWritable[candidates.length];
            read(readers, keys, values);
            while (true) {
                // Always emit the entry with the smallest key, then advance that reader.
                int index = min(keys);
                if (keys[index].toString().isEmpty()) {
                    break;
                }
                w.append(keys[index], values[index]);
                read(readers[index], keys[index], values[index]);
            }
        } finally {
            // Release every reader that was opened and the writer, even when the merge failed.
            try {
                if (readers != null) {
                    for (SequenceFile.Reader reader : readers) {
                        if (reader != null) {
                            reader.close();
                        }
                    }
                }
            } finally {
                w.close();
            }
        }
        // Only reached after a successful merge: the inputs are now redundant.
        for (FileStatus candidate : candidates) {
            Path merged = new Path(inPath + candidate.getPath().getName());
            // A silently failed delete would leave the input in place to be merged again.
            if (!fs.delete(merged, true)) {
                throw new IOException("Unable to delete merged input: " + merged);
            }
        }
    }
}

From source file:boostingPL.driver.SAMMEPLDriver.java

License:Open Source License

/**
 * Configures and runs the SAMME boosting job, either in training or in test mode.
 *
 * <p>An already existing model path is deleted before it is registered as the job output.
 *
 * @param args the command-line arguments, analysed by {@code commandAnalysis}
 * @return the non-zero status of {@code commandAnalysis} if argument analysis fails,
 *         otherwise 0 when the job completes successfully and -1 when it does not
 * @throws Exception if setting up or running the job fails
 */
@Override
public int run(String[] args) throws Exception {
    final int parseStatus = commandAnalysis(args);
    if (parseStatus != 0) {
        return parseStatus;
    }

    final String jobName = new StringBuilder("SAMMEPL:")
            .append(runModel).append(" ")
            .append(dataPath.toString()).append(" ")
            .append(modelPath.toString()).append(" ")
            .append(numLinesPerMap).append(" ")
            .append(numIterations)
            .toString();

    @SuppressWarnings("deprecation")
    Job job = new Job(getConf());
    job.setJobName(jobName);
    job.setJarByClass(SAMMEPLDriver.class);

    // Input: the data file, split into chunks of numLinesPerMap lines per mapper.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, dataPath);
    NLineInputFormat.setNumLinesPerSplit(job, numLinesPerMap);

    // Output: the model path, cleared first if it is already present.
    FileSystem modelFs = modelPath.getFileSystem(getConf());
    if (modelFs.exists(modelPath)) {
        modelFs.delete(modelPath, true);
    }
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, modelPath);

    final boolean training = runModel.equals("train");
    if (training) {
        // Training is map-only as far as this driver is concerned: no reducer class is set.
        job.setMapperClass(AdaBoostPLMapper.class);

        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(ClassifierWritable.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(ClassifierWritable.class);
    } else {
        // Any other mode runs the test mapper/reducer and replaces the output format set above.
        job.setMapperClass(AdaBoostPLTestMapper.class);
        job.setReducerClass(AdaBoostPLTestReducer.class);
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
    }

    // Pass the boosting parameters on to the tasks through the job configuration.
    Configuration jobConf = job.getConfiguration();
    jobConf.set("BoostingPL.boostingName", "SAMME");
    jobConf.set("BoostingPL.numIterations", String.valueOf(numIterations));
    jobConf.set("BoostingPL.modelPath", modelPath.toString());
    if (metadataPath != null) {
        jobConf.set("BoostingPL.metadata", metadataPath.toString());
    } else {
        // Default metadata location: next to the data file.
        jobConf.set("BoostingPL.metadata", dataPath.toString() + ".metadata");
    }
    if (outputFolder != null) {
        jobConf.set("BoostingPL.outputFolder", outputFolder.toString());
    }

    LOG.info(StringUtils.arrayToString(args));
    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:br.com.lassal.nqueens.grid.job.NQueenCounter.java

/**
 * Forma de chamada/*w  w w . j  a va2s  . com*/
 * <> {numero de rainhas} {diretorio raiz} -F
 *
 * @param strings
 * @return
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    // Configuration processed by ToolRunner
    Configuration conf = getConf();

    // Create a JobConf using the processed conf
    Job job = new Job(conf, "nqueens-counter");
    job.setJarByClass(NQueenCounter.class);

    int queensNumber = Integer.parseInt(args[0]);
    String workingFolder = args.length >= 2 ? args[1] : null;
    boolean isFinal = args.length >= 3 && "-F".equals(args[2]) ? true : false;

    Path sourcePath = this.setWorkingFolder(queensNumber, workingFolder, isFinal, job);
    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    if (isFinal) {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterResultReducer.class);
    } else {
        job.setMapperClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterMapper.class);
        job.setReducerClass(br.com.lassal.nqueens.grid.mapreduce.NQueenIncrementalCounterReducer.class);
    }

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);

    if (sourcePath != null) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(sourcePath, true);
    }

    return result ? 0 : 1;

}

From source file:bulkload.ImportTsv.java

License:Apache License

/**
 * Sets up the actual job.
 *
 * <p>The target table must already exist. When a bulk output path is configured, an existing
 * output directory is deleted and the job is configured to write HFiles for an incremental
 * load; otherwise the job writes straight to the table without reducers. Finally, entries
 * whose file name starts with "hadoop-" are removed from the job's "tmpjars" setting.
 *
 * @param conf
 *            The current configuration.
 * @param args
 *            The command line parameters: table name followed by the input directory.
 * @return The newly created job.
 * @throws IOException
 *             When setting up the job fails, including when the table does not exist.
 * @throws IllegalStateException
 *             When an existing bulk output directory cannot be deleted.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {

    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
        try (Admin admin = connection.getAdmin()) {
            // Support non-XML supported characters
            // by re-encoding the passed separator as a Base64 string.
            String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
            if (actualSeparator != null) {
                conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
            }
            TableName tableName = TableName.valueOf(args[0]);
            if (!admin.tableExists(tableName)) {
                String errorMsg = format("Table '%s' does not exist.", tableName);
                LOG.error(errorMsg);
                throw new TableNotFoundException(errorMsg);
            }
            Path inputDir = new Path(args[1]);
            String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
            job = Job.getInstance(conf, jobName);
            job.setJarByClass(TsvImporter.class);
            FileInputFormat.setInputPaths(job, inputDir);
            job.setInputFormatClass(TextInputFormat.class);
            job.setMapperClass(TsvImporter.class);

            String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
            if (hfileOutPath != null) {
                try (HTable table = (HTable) connection.getTable(tableName)) {
                    Path outputDir = new Path(hfileOutPath);
                    FileSystem fs = FileSystem.get(conf);
                    // Start from a clean output directory; refuse to continue if it cannot be removed.
                    if (fs.exists(outputDir)) {
                        if (!fs.delete(outputDir, true)) {
                            throw new IllegalStateException("delete path:" + outputDir + " failed");
                        }
                    }
                    FileOutputFormat.setOutputPath(job, outputDir);
                    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
                    job.setMapOutputValueClass(Put.class);
                    job.setReducerClass(PutSortReducer.class);
                    HFileOutputFormat2.configureIncrementalLoad(job, table, table);
                }
            } else {
                // No reducers. Just write straight to table. Call
                // initTableReducerJob
                // to set up the TableOutputFormat.
                TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
                job.setNumReduceTasks(0);
            }

            // Workaround to remove unnecessary hadoop dependencies.
            // "tmpjars" may be unset, so only rewrite it when it is present.
            String tmpJars = job.getConfiguration().get("tmpjars");
            if (tmpJars != null) {
                StringBuilder filteredJars = new StringBuilder();
                for (String jar : tmpJars.split(",", -1)) {
                    if (jar.isEmpty()) {
                        // Skip blanks produced by leading, trailing or doubled commas.
                        continue;
                    }
                    String fileName = jar.substring(jar.lastIndexOf('/') + 1);
                    if (!fileName.startsWith("hadoop-")) {
                        // Separate entries without leaving a trailing comma behind.
                        if (filteredJars.length() > 0) {
                            filteredJars.append(",");
                        }
                        filteredJars.append(jar);
                    }
                }
                job.getConfiguration().set("tmpjars", filteredJars.toString());
            }
        }
    }

    return job;
}