List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
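Before the project examples below, here is a minimal, self-contained sketch of what Path.toString() returns. The scheme, host, and file names are placeholder values chosen for illustration only; the examples that follow pass this string form to taps, sinks, predicates, and log messages.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // Path.toString() returns the path in its URI-style string form,
        // preserving the scheme and authority it was constructed with.
        Path local = new Path("/tmp/data/input.txt");            // placeholder local path
        Path dir = new Path("hdfs://namenode:8020/user/demo");   // placeholder HDFS URI
        Path file = new Path(dir, "input.txt");                  // child path under dir

        System.out.println(local.toString()); // /tmp/data/input.txt
        System.out.println(dir.toString());   // hdfs://namenode:8020/user/demo
        System.out.println(file.toString());  // hdfs://namenode:8020/user/demo/input.txt
    }
}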
From source file:com.facebook.presto.hive.util.InternalHiveSplitFactory.java
License:Apache License
private Optional<InternalHiveSplit> createInternalHiveSplit(Path path, BlockLocation[] blockLocations, long start,
        long length, long fileSize, OptionalInt bucketNumber, boolean splittable)
{
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }

    boolean forceLocalScheduling = this.forceLocalScheduling;

    // For empty files, some filesystems (e.g. LocalFileSystem) produce one empty block,
    // while others (e.g. hdfs.DistributedFileSystem) produce no block.
    // Synthesize an empty block if one does not already exist.
    if (fileSize == 0 && blockLocations.length == 0) {
        blockLocations = new BlockLocation[] { new BlockLocation() };
        // Turn off force local scheduling because the hosts list doesn't exist.
        forceLocalScheduling = false;
    }

    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range to the split range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty,
            // and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(blocks, start, length);

    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }

    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            fileSize,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            splittable,
            forceLocalScheduling && allBlocksHaveRealAddress(blocks),
            columnCoercions,
            bucketConversion,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path)));
}
From source file:com.facebook.presto.hive.util.TestAsyncRecursiveWalker.java
License:Apache License
private static FileSystem createMockFileSystem(final Map<String, List<FileStatus>> paths)
{
    return new StubFileSystem()
    {
        @Override
        public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
                throws IOException
        {
            ImmutableList.Builder<LocatedFileStatus> list = ImmutableList.builder();
            for (FileStatus status : paths.get(f.toString())) {
                list.add(new LocatedFileStatus(status, new BlockLocation[0]));
            }
            return remoteIterator(list.build().iterator());
        }

        @Override
        public FileStatus[] listStatus(Path f)
                throws IOException
        {
            List<FileStatus> fileStatuses = paths.get(f.toString());
            return fileStatuses.toArray(new FileStatus[fileStatuses.size()]);
        }
    };
}
From source file:com.facebook.presto.raptor.storage.SyncingFileSystem.java
License:Apache License
@Override
public FSDataOutputStream create(Path path, boolean overwrite, int bufferSize, short replication, long blockSize,
        Progressable progress)
        throws IOException
{
    if (exists(path) && !overwrite) {
        throw new IOException("file already exists: " + path);
    }
    Path parent = path.getParent();
    if ((parent != null) && !mkdirs(parent)) {
        throw new IOException("mkdirs failed to create " + parent.toString());
    }
    return new FSDataOutputStream(
            new BufferedOutputStream(new LocalFileOutputStream(pathToFile(path)), bufferSize),
            statistics);
}
From source file:com.finderbots.miner.MinerWorkflow.java
License:Apache License
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {
    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();
    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);
        // gak - this should be an HDFS file of seeds we iterate through
        is = MinerWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }
        // read all the lines from the hadoop file
        List<String> lines = IOUtils.readLines(is);
        for (String line : lines) {
            line = line.trim();
            if (line.startsWith("#")) {
                continue;
            }
            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
            writer.add(datum.getTuple());
        }
        writer.close();
    } catch (IOException e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        throw e;
    } finally {
        IoUtils.safeClose(is);
        if (writer != null) {
            writer.close();
        }
    }
}
From source file:com.finderbots.miner.MinerWorkflow.java
License:Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    //Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in a real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // add a regex url filter to filter outlinks
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());
    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, AnalyzeHtml analyzer,
        PinterestCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(PinterestCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES); // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT); // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, analyzer);

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateBooleanPreferenceFunction());

        Path minerOutputPath = new Path(curWorkingDirPath, CrawlConfig.MINER_SUBDIR_NAME);
        Tap minerOutputTap = new Hfs(new TextLine(), minerOutputPath.toString(), true);
        sinkMap.put(resultsPipe.getName(), minerOutputTap);
        tailPipes.add(resultsPipe);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, MineRTCriticsPreferences prefsAnalyzer,
        RTCriticsCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(RTCriticsCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES); // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT); // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) { // all the miner assembly happens here
        // analyze all pages that are to be mined, create an RTPageDatum
        // that will have data for a /m/ page OR a /critic/ page but not both
        // todo: in a perfect world there would be two datum types and we would
        // split them before analysis but it's nice to have all analysis in a single
        // function--maybe?
        Pipe prefsAnalyzerPipe = new Pipe("RT critics analyzer pipe", parsePipe.getTailPipe());
        prefsAnalyzerPipe = new Each(prefsAnalyzerPipe, prefsAnalyzer);

        // take all RTPageDatum, create a text line TSV then write to an output Tap
        Pipe prefsPipe = new Pipe("prefs pipe", prefsAnalyzerPipe);
        prefsPipe = new Each(prefsPipe, new CreateRTCriticsPrefsFunction());
        // todo: should we run through Unique?
        Path outPrefsPath = new Path(curWorkingDirPath, "prefs");
        Tap outPrefsTap = new Hfs(new TextLine(), outPrefsPath.toString(), true);
        sinkMap.put(prefsPipe.getName(), outPrefsTap);
        tailPipes.add(prefsPipe);

        // take all RTPageDatum, filter out all but /m/ pages,
        // make sure they are unique, create a TSV line per datum,
        // write to an output Tap
        Pipe filterMedia = new Pipe("filter_out_all_but_media_datum", prefsAnalyzerPipe);
        filterMedia = new Each(filterMedia, new FilterMediaDatumFunction());

        Pipe mediaPipe = new Pipe("create_media_records", filterMedia);
        mediaPipe = new Each(mediaPipe, new CreateRTMediaRecordsFunction());

        Pipe uniqueMedia = new Unique("uniquify_media_records", mediaPipe, new Fields("line"));

        Path outMediaPath = new Path(curWorkingDirPath, "media");
        Tap outMediaTap = new Hfs(new TextLine(), outMediaPath.toString(), true);
        sinkMap.put(uniqueMedia.getName(), outMediaTap);
        tailPipes.add(uniqueMedia);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.fullcontact.cassandra.io.util.RandomAccessReader.java
License:Apache License
protected RandomAccessReader(Path file, int bufferSize, boolean skipIOCache, PoolingSegmentedFile owner,
        FileSystem fs) throws FileNotFoundException {
    inputPath = file;

    try {
        inputFileStatus = fs.getFileStatus(inputPath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.fs = fs;

    try {
        this.input = fs.open(file);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.owner = owner;

    filePath = file.toString();

    // allocating required size of the buffer
    if (bufferSize <= 0)
        throw new IllegalArgumentException("bufferSize must be positive");
    buffer = new byte[bufferSize];

    this.skipIOCache = skipIOCache;

    // we can cache file length in read-only mode
    try {
        fileLength = fs.getFileStatus(file).getLen();
    } catch (IOException e) {
        throw new FSReadError(e, filePath);
    }
    validBufferBytes = -1; // that will trigger reBuffer() on demand by read/seek operations
}
From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(final JobContext context, final Path filename) {
    if (SSTablePredicates.IS_SSTABLE.apply(filename.toString())) {
        LOG.debug("{} is splittable.", filename);
        return indexes.containsKey(filename);
    } else {
        // Delegate non-sstable files to the FileInputFormat base class.
        LOG.debug("{} is not splittable.", filename);
        return super.isSplitable(context, filename);
    }
}
From source file:com.fullcontact.sstable.index.SSTableIndexIndexer.java
License:Apache License
public void index(final Path sstablePath) throws IOException {
    final FileSystem fileSystem = FileSystem.get(URI.create(sstablePath.toString()), configuration);
    final FileStatus fileStatus = fileSystem.getFileStatus(sstablePath);

    if (fileStatus.isDir()) {
        LOG.info("SSTable Indexing directory {}", sstablePath);
        final FileStatus[] statuses = fileSystem.listStatus(sstablePath);
        for (final FileStatus childStatus : statuses) {
            index(childStatus.getPath());
        }
    } else if (sstablePath.toString().endsWith(SST_EXTENSION)) {
        final Path sstableIndexPath = new Path(sstablePath.toString() + SSTableIndexIndex.SSTABLE_INDEX_SUFFIX);
        if (fileSystem.exists(sstableIndexPath)) {
            LOG.info("Skipping as SSTable index file already exists for {}", sstablePath);
        } else {
            // Kick off a thread for the index.
            final ListenableFuture<IndexRequest> indexFuture = service.submit(new Callable<IndexRequest>() {
                @Override
                public IndexRequest call() throws Exception {
                    final long startTime = System.currentTimeMillis();
                    final long fileSize = fileStatus.getLen();

                    LOG.info("Indexing SSTable file {}, size {} GB...", sstablePath,
                            decimalFormat.format(fileSize / (1024.0 * 1024.0 * 1024.0)));

                    indexSingleFile(fileSystem, sstablePath);

                    return new IndexRequest(sstableIndexPath, startTime, fileSize);
                }
            });

            Futures.addCallback(indexFuture, new FutureCallback<IndexRequest>() {
                public void onSuccess(final IndexRequest indexRequest) {
                    long indexSize = 0;

                    try {
                        indexSize = fileSystem.getFileStatus(indexRequest.getIndexPath()).getLen();
                    } catch (IOException e) {
                        LOG.error("Error getting file status for index path: {}", indexRequest.getIndexPath());
                    }

                    final double elapsed = (System.currentTimeMillis() - indexRequest.getStartTime()) / 1000.0;

                    LOG.info("Completed SSTable Indexing in {} seconds ({} MB/s). Index size is {} KB.",
                            decimalFormat.format(elapsed),
                            decimalFormat.format(indexRequest.getFileSize() / (1024.0 * 1024.0 * elapsed)),
                            decimalFormat.format(indexSize / 1024.0));
                }

                public void onFailure(Throwable e) {
                    LOG.error("Failed to index.", e);
                }
            });
        }
    }
}