List of usage examples for org.apache.hadoop.fs.Path#getFileSystem(Configuration)
public FileSystem getFileSystem(Configuration conf) throws IOException
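All of the examples below follow the same basic pattern: build a Path, resolve the FileSystem that owns it from a Hadoop Configuration (or JobConf), and then use that handle for file operations. A minimal, self-contained sketch of just that pattern is shown here; the class name and the default path are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        // The scheme of the Path (hdfs://, file://, s3a://, ...) decides which
        // FileSystem implementation getFileSystem() returns.
        Configuration conf = new Configuration();
        Path workingDir = new Path(args.length > 0 ? args[0] : "/tmp/working-dir"); // hypothetical path
        FileSystem fs = workingDir.getFileSystem(conf);

        System.out.println("Resolved filesystem: " + fs.getUri());
        System.out.println(workingDir + " exists? " + fs.exists(workingDir));
    }
}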
From source file: com.finderbots.miner.MinerTool.java
License: Apache License
public static void main(String[] args) throws IOException {
    MinerOptions options = new MinerOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());
        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, options.getUrlsFile());

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }
        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        String crawlUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> crawlUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(crawlUrlFiltersFile);
        BaseUrlFilter crawlUrlFilter = new RegexUrlFilter(
                crawlUrlPatterns.toArray(new String[crawlUrlPatterns.size()]));

        // By setting up a miner filter we will mine only pages that match one of the urls.
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> mineUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        BaseUrlFilter mineUrlFilter = new RegexUrlFilter(
                mineUrlPatterns.toArray(new String[mineUrlPatterns.size()]));

        // Limit the crawl to the configured number of loops.
        for (int curLoop = 1; curLoop <= options.getNumLoops(); curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);

            Flow flow = MinerWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, crawlUrlFilter, mineUrlFilter);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
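setupWorkingDir and CrawlDirUtils are helpers specific to this project. As a rough, hedged sketch of the same idea using only the plain FileSystem API returned by getFileSystem, the working directory can be created if missing and the most recently modified loop directory located like this (the directory layout and the "newest by modification time" rule are assumptions, not the project's actual logic):

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: ensure the working dir exists and return the newest loop subdir, or null.
static Path findNewestLoopDir(Path workingDirPath) throws IOException {
    FileSystem fs = workingDirPath.getFileSystem(new JobConf());
    if (!fs.exists(workingDirPath)) {
        fs.mkdirs(workingDirPath);
        return null; // nothing to find in a fresh working dir
    }
    Path newest = null;
    long newestTime = Long.MIN_VALUE;
    for (FileStatus status : fs.listStatus(workingDirPath)) {
        if (status.isDir() && status.getModificationTime() > newestTime) {
            newestTime = status.getModificationTime();
            newest = status.getPath();
        }
    }
    return newest;
}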
From source file: com.finderbots.miner.MinerWorkflow.java
License: Apache License
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {
    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);

        // TODO: this should be an HDFS file of seeds we iterate through.
        is = MinerWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }

        // Read all the lines from the seed file, skipping comments.
        List<String> lines = IOUtils.readLines(is);
        for (String line : lines) {
            line = line.trim();
            if (line.startsWith("#")) {
                continue;
            }
            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
            writer.add(datum.getTuple());
        }
    } catch (IOException e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        throw e;
    } finally {
        IoUtils.safeClose(is);
        if (writer != null) {
            writer.close();
        }
    }
}
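The Cascading Tap/TupleEntryCollector above writes the seeds as CrawlDbDatum tuples. As a hedged alternative sketch that stays at the Hadoop level, the normalized seed lines could be written straight to HDFS with the FileSystem obtained from getFileSystem, with the same delete-on-failure cleanup; the part-file name is a hypothetical choice:

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: write seed URLs as plain text lines under the crawl db path.
static void writeSeedUrls(Path crawlDbPath, List<String> seedUrls) throws IOException {
    FileSystem fs = crawlDbPath.getFileSystem(new JobConf());
    Path seedFile = new Path(crawlDbPath, "part-00000"); // hypothetical file name
    BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(seedFile, true), "UTF-8"));
    boolean ok = false;
    try {
        for (String url : seedUrls) {
            writer.write(url);
            writer.newLine();
        }
        ok = true;
    } finally {
        writer.close();
        if (!ok) {
            // Mirror the safeRemove() cleanup above: drop the partial output.
            fs.delete(crawlDbPath, true);
        }
    }
}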
From source file: com.finderbots.miner.MinerWorkflow.java
License: Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir. HTML only.
    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    //Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in a real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job.
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // Add a regex url filter to filter outlinks.
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, and outlinks.
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // Output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
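Each loop writes its crawldb, status, content, and results under one loop directory. A hedged sketch of cleaning up a loop directory whose flow failed, so that a re-run does not treat it as the latest loop; this error-handling policy is an assumption, not part of MinerWorkflow:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: remove a partially written loop directory after a failed flow run.
static void removeFailedLoopDir(Path curLoopDirPath) throws IOException {
    FileSystem fs = curLoopDirPath.getFileSystem(new JobConf());
    if (fs.exists(curLoopDirPath)) {
        // Recursive delete, so the crawldb/status/content/results subdirs go with it.
        fs.delete(curLoopDirPath, true);
    }
}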
From source file: com.finderbots.miner.RegexUrlFilter.java
License: Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // This reads regex filters from a file in HDFS or the native file system.
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
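The same filter file could be read with a plain BufferedReader instead of Hadoop's LineReader. A hedged sketch follows; the "+" and "-" include/exclude markers are assumed from the regex patterns used elsewhere in this tool and may differ from the actual INCLUDE_CHAR/EXCLUDE_CHAR constants:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: read include/exclude regex lines from a filter file in HDFS or locally.
static List<String> readFilterLines(String urlFiltersFile) throws IOException {
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(new JobConf());
    List<String> lines = new ArrayList<String>();
    if (!fs.exists(filterFile)) {
        return lines;
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(filterFile), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.startsWith("+") || line.startsWith("-")) { // assumed include/exclude markers
                lines.add(line.trim());
            }
        }
    } finally {
        reader.close();
    }
    return lines;
}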
From source file: com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java
License: Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else.
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites.
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
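The clean-or-create handling of the output directory at the top of main() is a pattern worth isolating. A hedged sketch, pulled out into a hypothetical helper that is not part of the tool itself:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch of the clean-or-create output dir pattern used at the top of main().
static FileSystem prepareOutputDir(String outputDirName, boolean cleanOutputDir) throws IOException {
    Path outputPath = new Path(outputDirName);
    FileSystem fs = outputPath.getFileSystem(new JobConf());
    if (cleanOutputDir && fs.exists(outputPath)) {
        fs.delete(outputPath, true); // wipe previous crawl state
    }
    if (!fs.exists(outputPath)) {
        fs.mkdirs(outputPath); // fresh start: the tool then seeds a "0-<timestamp>" loop dir
    }
    return fs;
}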
From source file: com.finderbots.miner2.pinterest.PinterestCrawlAndMinerWorkflow.java
License: Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, AnalyzeHtml analyzer,
        PinterestCrawlAndMinerTool.Options options) throws Throwable {

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(PinterestCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum.
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks:
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job.
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);            // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);         // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text.
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, analyzer);

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateBooleanPreferenceFunction());

        Path minerOutputPath = new Path(curWorkingDirPath, CrawlConfig.MINER_SUBDIR_NAME);
        Tap minerOutputTap = new Hfs(new TextLine(), minerOutputPath.toString(), true);
        sinkMap.put(resultsPipe.getName(), minerOutputTap);
        tailPipes.add(resultsPipe);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
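The workflow's last sink is a Text/Text WritableSequenceFile meant as Mahout input. A hedged sketch of reading one of those part files back with Hadoop's SequenceFile.Reader, e.g. to spot-check the extracted text; the part-file name is an assumption, and a real reader would iterate over fs.listStatus() of the directory:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: read back one Text/Text sequence file written by the workflow above.
static void dumpExtractedText(Path extractedTextDirPath) throws IOException {
    JobConf conf = new JobConf();
    FileSystem fs = extractedTextDirPath.getFileSystem(conf);
    Path partFile = new Path(extractedTextDirPath, "part-00000"); // hypothetical part-file name
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, partFile, conf);
    try {
        Text key = new Text();
        Text value = new Text();
        while (reader.next(key, value)) {
            System.out.println(key + "\t" + value.toString().length() + " chars");
        }
    } finally {
        reader.close();
    }
}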
From source file: com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java
License: Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else.
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean.
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);
            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites.
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerWorkflow.java
License: Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, MineRTCriticsPreferences prefsAnalyzer,
        RTCriticsCrawlAndMinerTool.Options options) throws Throwable {

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(RTCriticsCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum.
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched.
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());

    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks:
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job.
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);            // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);         // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason, lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text.
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        // All the miner assembly happens here.
        // Analyze all pages that are to be mined, creating an RTPageDatum
        // that will have data for an /m/ page OR a /critic/ page, but not both.
        // TODO: in a perfect world there would be two datum types and we would
        // split them before analysis, but it's nice to have all analysis in a single
        // function--maybe?
        Pipe prefsAnalyzerPipe = new Pipe("RT critics analyzer pipe", parsePipe.getTailPipe());
        prefsAnalyzerPipe = new Each(prefsAnalyzerPipe, prefsAnalyzer);

        // Take all RTPageDatum, create a text line TSV, then write to an output Tap.
        Pipe prefsPipe = new Pipe("prefs pipe", prefsAnalyzerPipe);
        prefsPipe = new Each(prefsPipe, new CreateRTCriticsPrefsFunction());
        // TODO: should we run through Unique?
        Path outPrefsPath = new Path(curWorkingDirPath, "prefs");
        Tap outPrefsTap = new Hfs(new TextLine(), outPrefsPath.toString(), true);
        sinkMap.put(prefsPipe.getName(), outPrefsTap);
        tailPipes.add(prefsPipe);

        // Take all RTPageDatum, filter out all but /m/ pages,
        // make sure they are unique, create a TSV line per datum,
        // and write to an output Tap.
        Pipe filterMedia = new Pipe("filter_out_all_but_media_datum", prefsAnalyzerPipe);
        filterMedia = new Each(filterMedia, new FilterMediaDatumFunction());
        Pipe mediaPipe = new Pipe("create_media_records", filterMedia);
        mediaPipe = new Each(mediaPipe, new CreateRTMediaRecordsFunction());
        Pipe uniqueMedia = new Unique("uniquify_media_records", mediaPipe, new Fields("line"));

        Path outMediaPath = new Path(curWorkingDirPath, "media");
        Tap outMediaTap = new Hfs(new TextLine(), outMediaPath.toString(), true);
        sinkMap.put(uniqueMedia.getName(), outMediaTap);
        tailPipes.add(uniqueMedia);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db, so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool.
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file: com.finderbots.utilities.ExportPinterestPrefsTool.java
License: Apache License
public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // Get the urls of users and the urls of followed people, make sure they are unique,
        // create an index, and write the ids out as a CSV file of prefs for Mahout input.
        Flow exportPinterestPrefsWorkFlow = ExportPinterestPrefsWorkflow.createFlow(crawlPath, options);
        exportPinterestPrefsWorkFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: com.finderbots.utilities.ExportTool.java
License: Apache License
public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // Create a flow that takes all parsed text and accumulates it into a single sink in Mahout format.
        Flow exportToMahoutFlow = ExportAllToMahoutWorkflow.createFlow(crawlPath, options);
        exportToMahoutFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
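Both export tools resolve a FileSystem for the output path even though the Cascading flow performs the actual writes. A hedged sketch of putting that handle to use by validating the crawl and output paths before connecting the flow; these checks are an assumption, not part of ExportTool:

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

// Hedged sketch: sanity-check the export paths before running the flow.
static void validateExportPaths(Path crawlPath, Path outputPath) throws IOException {
    JobConf conf = new JobConf();
    FileSystem crawlFs = crawlPath.getFileSystem(conf);
    FileSystem outFs = outputPath.getFileSystem(conf);
    if (!crawlFs.exists(crawlPath)) {
        throw new IOException("Crawl dir doesn't exist: " + crawlPath);
    }
    if (outFs.exists(outputPath)) {
        throw new IOException("Output dir already exists, refusing to overwrite: " + outputPath);
    }
}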