Example usage for org.apache.hadoop.fs FileSystem exists

Introduction

In this page you can find the example usage for org.apache.hadoop.fs FileSystem exists.

Prototype

public boolean exists(Path f) throws IOException

Source Link

Document

Check if a path exists.

Usage

From source file:be_uclouvain_ingi2145_lab05.GiraphJobRunner.java

@Override
public int run(String[] strings) throws Exception {

    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    //gconf.setVertexClass(SimpleShortestPathsComputation.class);
    /*gconf.setVertexInputFormatClass(
        SimpleShortestPathsVertexInputFormat.class);
    gconf.setVertexOutputFormatClass(/*  w w w.  j  a  va 2  s .c  o  m*/
        SimpleShortestPathsVertexOutputFormat.class);
    */
    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    //GiraphYarnClient job = new GiraphYarnClient(gconf,gconf.getClass().getName());
    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());
    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));

            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            /*Check if output path (args[1])exist or not*/
            if (fs.exists(outputPath)) {
                /*If exist delete the output path*/
                fs.delete(outputPath, true);
            }

            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }
    /*
    if (cmd.hasOption("vif") || cmd.hasOption("eif")) {
      if (cmd.hasOption("vip")) {
          FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op")));
      }
    }*/
    //If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        LOG.fatal("" + Arrays.toString(args));

        gconf.set("ca", args[0].split("=")[1]);
        LOG.fatal("" + gconf.get("ca"));
        gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")),
                Integer.parseInt(cmd.getOptionValue("w")), 100.0f);
    }
    /*
    if (cmd.hasOption("cf")) {
      DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")),
          job.getConfiguration());
    }
    */
    return job.run(true) ? 0 : -1;
}

From source file:bigimp.BuildForest.java

License:Apache License

private void buildForest() throws IOException, ClassNotFoundException, InterruptedException {
    // make sure the output path does not exist
    FileSystem ofs = outputPath.getFileSystem(getConf());
    if (ofs.exists(outputPath)) {
        log.error("Output path already exists");
        return;//from w  w  w . j a  v a 2 s  . co  m
    }

    DecisionTreeBuilder treeBuilder = new DecisionTreeBuilder();
    if (m != null) {
        treeBuilder.setM(m);
    }
    treeBuilder.setComplemented(complemented);
    if (minSplitNum != null) {
        treeBuilder.setMinSplitNum(minSplitNum);
    }
    if (minVarianceProportion != null) {
        treeBuilder.setMinVarianceProportion(minVarianceProportion);
    }

    Builder forestBuilder;

    if (isPartial) {
        log.info("Partial Mapred implementation");
        forestBuilder = new PartialBuilder(treeBuilder, dataPath, datasetPath, seed, getConf());
    } else {
        log.info("InMem Mapred implementation");
        forestBuilder = new InMemBuilder(treeBuilder, dataPath, datasetPath, seed, getConf());
    }

    forestBuilder.setOutputDirName(outputPath.getName());

    log.info("Building the forest...");
    long time = System.currentTimeMillis();

    DecisionForest forest = forestBuilder.build(nbTrees);

    time = System.currentTimeMillis() - time;
    log.info("Build Time: {}", DFUtils.elapsedTime(time));
    log.info("Forest num Nodes: {}", forest.nbNodes());
    log.info("Forest mean num Nodes: {}", forest.meanNbNodes());
    log.info("Forest mean max Depth: {}", forest.meanMaxDepth());

    // store the decision forest in the output path
    Path forestPath = new Path(outputPath, "forest.seq");
    log.info("Storing the forest in: {}", forestPath);
    DFUtils.storeWritable(getConf(), forestPath, forest);
}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {//from  w  w w .  ja  va  2  s.  c om
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user want to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the 
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.DemoStatusTool.java

License:Apache License

public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {/*www  . j a  v  a 2  s . co  m*/
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);

            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {//w w  w  . j  ava  2  s  . co m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}

From source file:bixo.examples.crawl.JDBCCrawlWorkflow.java

License:Apache License

public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);

    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }//  w  w  w . jav a2  s  .c om

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Group on the url, and select the best urls to best
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and 
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatum's. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);

}

From source file:bixo.examples.crawl.MultiDomainUrlFilter.java

License:Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines
                    String p = tmpStr.toString().trim();//remove whitespace
                    if (p.substring(0, 1).equals("+")) {// '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) {// '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(new Boolean(false));
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }/*from   ww w . j ava2  s.co m*/
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}

From source file:bixo.examples.crawl.RegexUrlFilter.java

License:Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }/*from  w  w w. j  a  va2 s .  c  o m*/
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file:bixo.examples.crawl.SimpleCrawlTool.java

License:Apache License

public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {//from  w w w  .j a va  2s.  co m
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user isn't starting from scratch then set up the 
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.SimpleCrawlWorkflow.java

License:Apache License

public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(),
            conf);//  www.j a  v a 2s.  c  o m
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    Path productsDirPath = new Path(curWorkingDirPath, CrawlConfig.PRODUCTS_SUBDIR_NAME);
    Tap productsSink = new Hfs(new TextLine(), productsDirPath.toString());
    // Tap productsSink = new Hfs(new TextLine(ProductDatum.FIELDS), productsDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types you want to restrict what content type you 
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the 
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Pipe productsPipe = new Pipe("products pipe", parsePipe);
    // PRECIOUS Pipe productsPipe = new Pipe("products pipe", fetchPipe.getContentTailPipe());
    String regex = "[a-z]+@[a-z]+.[a-z]+";
    // WAS: String regex = "[\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Z]{2,4}";
    Function emailExtractor = new RegexGenerator(new Fields("email"), regex);
    productsPipe = new Each(productsPipe, emailExtractor);
    // PRECIOUS productsPipe = new Each(productsPipe, new CreateProductDatumsFunction());
    productsPipe = TupleLogger.makePipe(productsPipe, true);

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it  
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status ouput, and the urls we didn't process from the db so that 
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums 
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
    sinkMap.put(productsPipe.getName(), productsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}