Example usage for org.apache.hadoop.fs Path toString

Introduction

This page collects usage examples of the org.apache.hadoop.fs.Path#toString() method, drawn from open-source projects.

Prototype

@Override
public String toString()
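
Path.toString() returns the string form of the path, including the scheme and authority when the path is fully qualified, which is why the examples below pass it to APIs that expect a plain string rather than a Path object. A minimal sketch of that behavior follows; the class name, host, and paths are invented purely for illustration:

import org.apache.hadoop.fs.Path;

public class PathToStringDemo {
    public static void main(String[] args) {
        // Relative path: toString() returns just the path component.
        Path relative = new Path("data/input.txt");
        System.out.println(relative.toString()); // data/input.txt

        // Fully qualified path: scheme and authority are preserved.
        Path qualified = new Path("hdfs://namenode:8020/user/demo/input.txt");
        System.out.println(qualified.toString()); // hdfs://namenode:8020/user/demo/input.txt

        // Parent/child constructor, as used throughout the usage examples below.
        Path child = new Path(new Path("/user/demo"), "output");
        System.out.println(child.toString()); // /user/demo/output
    }
}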

Usage

From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java

License:Apache License

@Override
public void setup(Context context) throws IOException {
    try {
        Configuration conf = context.getConfiguration();

        minSup = conf.getInt(MIN_SUP_KEY, -1);
        prefixLength = conf.getInt(PREFIX_LENGTH_KEY, 1);

        Path[] localCacheFiles = getLocalCacheFiles(conf);

        for (Path path : localCacheFiles) {
            String pathString = path.toString();
            if (pathString.contains(OSingletonsTids)) {
                System.out.println("[PrefixComputerMapper]: Reading singletons");
                singletons = readTidLists(conf, path);
            } else if (pathString.contains(OSingletonsOrder)) {
                System.out.println("[PrefixComputerMapper]: Reading singleton orders");
                orderMap = readSingletonsOrder(path);
            }
        }

        sortSingletons();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:be.uantwerpen.adrem.disteclat.PrefixComputerMapper.java

License:Apache License

/**
 * Reads the singletons ordering from file.
 *
 * @param path path of the file containing the singletons order
 * @return map from each singleton item to its position in the order
 * @throws IOException if the file cannot be read
 */
private static Map<Integer, Integer> readSingletonsOrder(Path path) throws IOException {
    BufferedReader reader = new BufferedReader(new FileReader(path.toString()));

    String order = reader.readLine().trim();
    reader.close();

    Map<Integer, Integer> orderMap = newHashMap();
    String[] split = order.split(" ");
    int ix = 0;
    for (String item : split) {
        orderMap.put(valueOf(item), ix++);
    }
    return orderMap;
}

From source file:be.uantwerpen.adrem.hadoop.util.Tools.java

License:Apache License

@SuppressWarnings("rawtypes")
public static Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat,
        Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey,
        Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer,
        Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue,
        Class<? extends OutputFormat> outputFormat) throws IOException {
    Job job = new Job(new Configuration());

    Configuration jobConf = job.getConfiguration();

    if (reducer.equals(Reducer.class)) {
        if (mapper.equals(Mapper.class)) {
            throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer");
        }
        job.setJarByClass(mapper);
    } else {
        job.setJarByClass(reducer);
    }

    job.setInputFormatClass(inputFormat);
    jobConf.set("mapred.input.dir", inputPath.toString());

    job.setMapperClass(mapper);
    if (mapperKey != null) {
        job.setMapOutputKeyClass(mapperKey);
    }
    if (mapperValue != null) {
        job.setMapOutputValueClass(mapperValue);
    }

    jobConf.setBoolean("mapred.compress.map.output", true);

    job.setReducerClass(reducer);
    job.setOutputKeyClass(reducerKey);
    job.setOutputValueClass(reducerValue);

    job.setOutputFormatClass(outputFormat);
    jobConf.set("mapred.output.dir", outputPath.toString());

    return job;
}

From source file:bigfat.hadoop.HDFSDirInputStream.java

License:Apache License

/**
 * Creates an input stream that reads through all the files in one
 * directory. Note that the files are sorted by name, using the given
 * comparator when one is supplied.
 *
 * @param fs the file system containing the directory
 * @param dir the directory (or single file) to read
 * @param comp optional comparator used to order the file names
 * @throws IOException if the directory cannot be listed
 */
public HDFSDirInputStream(FileSystem fs, String dir, Comparator<String> comp) throws IOException {
    this.fs = fs;
    Path p = new Path(dir);
    FileStatus fstate = fs.getFileStatus(p);
    if (fstate.isDir()) {
        FileStatus[] child = fs.globStatus(new Path(dir + "/*"));
        LinkedList<String> s = new LinkedList<String>();
        Map<String, Path> map = new HashMap<String, Path>();
        for (FileStatus c : child) {
            if (c.isDir())
                continue;
            map.put(c.getPath().getName(), c.getPath());
            s.add(c.getPath().getName());
        }
        if (comp != null)
            Collections.sort(s, comp);
        else
            Collections.sort(s);
        Iterator<String> it = s.iterator();
        while (it.hasNext()) {
            String n = it.next();
            Path pr = map.get(n);
            this.appendFile(pr.toString());
        }
    } else {
        this.appendFile(dir);
    }
}

From source file:bixo.examples.crawl.DemoCrawlWorkflowLRTest.java

License:Apache License

@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    DemoCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    options.setUseBoilerpipe(true);
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the
        // /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.

        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;

                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();
        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            // datum.getUrl(), datum.getLastStatus(),
            // datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }

}

From source file:bixo.examples.crawl.JDBCCrawlWorkflow.java

License:Apache License

public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);

    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Group on the url, and select the best url to fetch
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and 
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatum's. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);

}

From source file:bixo.examples.crawl.LatestUrlDatumBufferTest.java

License:Apache License

@Test
public void testOperateWithGroupBy() throws IOException {

    // Create a temp file with a fetched url
    Path fetchedDatumsPath = new Path(_workingDirPath, "fetched");
    ArrayList<UrlDatum> fetchedDatums = new ArrayList<UrlDatum>();
    UrlDatum fetchedDatum1 = new UrlDatum("http://foo.com");
    fetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 2L);
    fetchedDatums.add(fetchedDatum1);
    createDataFile(fetchedDatumsPath.toString(), fetchedDatums);

    // And another with unfetched urls
    Path unfetchedDatumsPath = new Path(_workingDirPath, "unfetched");
    ArrayList<UrlDatum> unfetchedDatums = new ArrayList<UrlDatum>();
    UrlDatum unfetchedDatum1 = new UrlDatum("http://foo.com");
    unfetchedDatum1.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
    unfetchedDatums.add(unfetchedDatum1);
    UrlDatum unfetchedDatum2 = new UrlDatum("http://foo.com");
    unfetchedDatum2.setPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD, 0L);
    unfetchedDatums.add(unfetchedDatum2);

    createDataFile(unfetchedDatumsPath.toString(), unfetchedDatums);

    // create a workflow
    Tap inputSource1 = new Hfs(new SequenceFile(UrlDatum.FIELDS), fetchedDatumsPath.toString());
    Pipe fetchedPipe = new Pipe("fetched");
    Tap inputSource2 = new Hfs(new SequenceFile(UrlDatum.FIELDS), unfetchedDatumsPath.toString());
    Pipe unfetchedPipe = new Pipe("unfetched");

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put(fetchedPipe.getName(), inputSource1);
    sources.put(unfetchedPipe.getName(), inputSource2);

    Path resultsPath = new Path(_workingDirPath, "results");
    Tap resultSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), true);

    Pipe resultsPipe = new GroupBy("results pipe", Pipe.pipes(fetchedPipe, unfetchedPipe),
            new Fields(UrlDatum.URL_FN));
    resultsPipe = new Every(resultsPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Properties props = HadoopUtils.getDefaultProperties(LatestUrlDatumBufferTest.class, false, _conf);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(sources, resultSink, resultsPipe);
    flow.complete();

    // verify that the resulting pipe has the latest tuple

    Tap testSink = new Hfs(new SequenceFile(UrlDatum.FIELDS), resultsPath.toString(), false);
    TupleEntryIterator reader = testSink.openForRead(_conf);
    int count = 0;
    long latest = 0;
    while (reader.hasNext()) {
        TupleEntry next = reader.next();
        UrlDatum datum = new UrlDatum(next);
        latest = (Long) datum.getPayloadValue(CrawlDbDatum.LAST_FETCHED_FIELD);
        count++;
    }

    assertEquals(1, count);
    assertEquals(2, latest);

}

From source file:bixo.examples.crawl.SimpleCrawlWorkflow.java

License:Apache License

public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(),
            conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    Path productsDirPath = new Path(curWorkingDirPath, CrawlConfig.PRODUCTS_SUBDIR_NAME);
    Tap productsSink = new Hfs(new TextLine(), productsDirPath.toString());
    // Tap productsSink = new Hfs(new TextLine(ProductDatum.FIELDS), productsDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types you want to restrict what content type you 
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the 
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Pipe productsPipe = new Pipe("products pipe", parsePipe);
    // PRECIOUS Pipe productsPipe = new Pipe("products pipe", fetchPipe.getContentTailPipe());
    String regex = "[a-z]+@[a-z]+.[a-z]+";
    // WAS: String regex = "[\\w\\-]([\\.\\w])+[\\w]+@([\\w\\-]+\\.)+[A-Z]{2,4}";
    Function emailExtractor = new RegexGenerator(new Fields("email"), regex);
    productsPipe = new Each(productsPipe, emailExtractor);
    // PRECIOUS productsPipe = new Each(productsPipe, new CreateProductDatumsFunction());
    productsPipe = TupleLogger.makePipe(productsPipe, true);

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it  
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums 
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);
    sinkMap.put(productsPipe.getName(), productsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}

From source file:bixo.examples.crawl.SimpleCrawlWorkflowLRTest.java

License:Apache License

@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    SimpleCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the
        // /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.

        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;

                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();
        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            // datum.getUrl(), datum.getLastStatus(),
            // datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }

}

From source file:bixo.examples.JDBCCrawlWorkflow.java

License:Open Source License

public static Flow createFlow(Path inputDir, Path curLoopDirPath, UserAgent userAgent,
        FetcherPolicy fetcherPolicy, BaseUrlFilter urlFilter, int maxThreads, boolean debug,
        String persistentDbLocation) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = conf.getNumReduceTasks() * HadoopUtils.getTaskTrackers(conf);
    FileSystem fs = curLoopDirPath.getFileSystem(conf);

    if (!fs.exists(inputDir)) {
        throw new IllegalStateException(String.format("Input directory %s doesn't exist", inputDir));
    }

    Tap inputSource = JDBCTapFactory.createUrlsSourceJDBCTap(persistentDbLocation);

    // Read _everything_ in initially
    // Split that pipe into URLs we want to fetch for the fetch pipe
    Pipe importPipe = new Pipe("url importer");
    importPipe = new GroupBy(importPipe, new Fields(CrawlDbDatum.URL_FIELD));
    importPipe = new Every(importPipe, new BestUrlToFetchBuffer(), Fields.RESULTS);

    Path contentPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentPath.toString());

    Path parsePath = new Path(curLoopDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parsePath.toString());

    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // NOTE: The source and sink for CrawlDbDatums is essentially the same database -
    // since cascading doesn't allow you to use the same tap for source and 
    // sink we fake it by creating two separate taps.
    Tap urlSink = JDBCTapFactory.createUrlsSinkJDBCTap(persistentDbLocation);

    // Create the sub-assembly that runs the fetch job
    BaseFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(importPipe, scorer, fetcher, numReducers);

    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());

    // Take content and split it into content output plus parse to extract URLs.
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), new SimpleParser());
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));

    // Take status and output updated UrlDatum's. Again, since we are using
    // the same database we need to create a new tap.
    Pipe urlFromFetchPipe = new Pipe("url from fetch", fetchPipe.getStatusTailPipe());
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());

    // Now we need to join the URLs we get from parsing content with the
    // URLs we got from the status output, so we have a unified stream
    // of all known URLs.
    Pipe urlPipe = new GroupBy("url pipe", Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe),
            new Fields(UrlDatum.URL_FN));
    urlPipe = new Every(urlPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(urlPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(FetchPipe.CONTENT_PIPE_NAME, contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(outputPipe.getName(), urlSink);

    // Finally we can run it.
    FlowConnector flowConnector = new FlowConnector(
            HadoopUtils.getDefaultProperties(JDBCCrawlWorkflow.class, debug, conf));
    return flowConnector.connect(inputSource, sinkMap, statusPipe, fetchPipe.getContentTailPipe(),
            parsePipe.getTailPipe(), outputPipe);

}