List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
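Before the project examples below, here is a minimal, self-contained sketch of what Path.toString() returns. The scheme, host, and file names are placeholder values chosen for illustration only; the examples that follow pass this string form to taps, sinks, predicates, and log messages.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // Path.toString() returns the path in its URI-style string form,
        // preserving the scheme and authority it was constructed with.
        Path local = new Path("/tmp/data/input.txt");            // placeholder local path
        Path dir = new Path("hdfs://namenode:8020/user/demo");   // placeholder HDFS URI
        Path file = new Path(dir, "input.txt");                  // child path under dir

        System.out.println(local.toString()); // /tmp/data/input.txt
        System.out.println(dir.toString());   // hdfs://namenode:8020/user/demo
        System.out.println(file.toString());  // hdfs://namenode:8020/user/demo/input.txt
    }
}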
From source file:com.facebook.presto.hive.util.InternalHiveSplitFactory.java
License:Apache License
private Optional<InternalHiveSplit> createInternalHiveSplit(Path path, BlockLocation[] blockLocations, long start,
        long length, long fileSize, OptionalInt bucketNumber, boolean splittable)
{
    String pathString = path.toString();
    if (!pathMatchesPredicate(pathDomain, pathString)) {
        return Optional.empty();
    }

    boolean forceLocalScheduling = this.forceLocalScheduling;

    // For empty files, some filesystems (e.g. LocalFileSystem) produce one empty block,
    // while others (e.g. hdfs.DistributedFileSystem) produce no block.
    // Synthesize an empty block if one does not already exist.
    if (fileSize == 0 && blockLocations.length == 0) {
        blockLocations = new BlockLocation[] { new BlockLocation() };
        // Turn off force local scheduling because the hosts list doesn't exist.
        forceLocalScheduling = false;
    }

    ImmutableList.Builder<InternalHiveBlock> blockBuilder = ImmutableList.builder();
    for (BlockLocation blockLocation : blockLocations) {
        // clamp the block range to the split range
        long blockStart = Math.max(start, blockLocation.getOffset());
        long blockEnd = Math.min(start + length, blockLocation.getOffset() + blockLocation.getLength());
        if (blockStart > blockEnd) {
            // block is outside split range
            continue;
        }
        if (blockStart == blockEnd && !(blockStart == start && blockEnd == start + length)) {
            // skip zero-width block, except in the special circumstance: slice is empty,
            // and the block covers the empty slice interval.
            continue;
        }
        blockBuilder.add(new InternalHiveBlock(blockStart, blockEnd, getHostAddresses(blockLocation)));
    }
    List<InternalHiveBlock> blocks = blockBuilder.build();
    checkBlocks(blocks, start, length);

    if (!splittable) {
        // not splittable, use the hosts from the first block if it exists
        blocks = ImmutableList.of(new InternalHiveBlock(start, start + length, blocks.get(0).getAddresses()));
    }

    return Optional.of(new InternalHiveSplit(
            partitionName,
            pathString,
            start,
            start + length,
            fileSize,
            schema,
            partitionKeys,
            blocks,
            bucketNumber,
            splittable,
            forceLocalScheduling && allBlocksHaveRealAddress(blocks),
            columnCoercions,
            bucketConversion,
            s3SelectPushdownEnabled && S3SelectPushdown.isCompressionCodecSupported(inputFormat, path)));
}
From source file:com.facebook.presto.hive.util.TestAsyncRecursiveWalker.java
License:Apache License
private static FileSystem createMockFileSystem(final Map<String, List<FileStatus>> paths)
{
    return new StubFileSystem()
    {
        @Override
        public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
                throws IOException
        {
            ImmutableList.Builder<LocatedFileStatus> list = ImmutableList.builder();
            for (FileStatus status : paths.get(f.toString())) {
                list.add(new LocatedFileStatus(status, new BlockLocation[0]));
            }
            return remoteIterator(list.build().iterator());
        }

        @Override
        public FileStatus[] listStatus(Path f)
                throws IOException
        {
            List<FileStatus> fileStatuses = paths.get(f.toString());
            return fileStatuses.toArray(new FileStatus[fileStatuses.size()]);
        }
    };
}
From source file:com.facebook.presto.raptor.storage.SyncingFileSystem.java
License:Apache License
@Override
public FSDataOutputStream create(Path path, boolean overwrite, int bufferSize, short replication, long blockSize,
        Progressable progress)
        throws IOException
{
    if (exists(path) && !overwrite) {
        throw new IOException("file already exists: " + path);
    }
    Path parent = path.getParent();
    if ((parent != null) && !mkdirs(parent)) {
        throw new IOException("mkdirs failed to create " + parent.toString());
    }
    return new FSDataOutputStream(
            new BufferedOutputStream(new LocalFileOutputStream(pathToFile(path)), bufferSize),
            statistics);
}
From source file:com.finderbots.miner.MinerWorkflow.java
License:Apache License
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {
    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();
    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);
        // gak - this should be an HDFS file of seeds we iterate through
        is = MinerWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }
        // read all the lines from the hadoop file
        List<String> lines = IOUtils.readLines(is);
        for (String line : lines) {
            line = line.trim();
            if (line.startsWith("#")) {
                continue;
            }
            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f, 0.0f);
            writer.add(datum.getTuple());
        }
        writer.close();
    } catch (IOException e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        throw e;
    } finally {
        IoUtils.safeClose(is);
        if (writer != null) {
            writer.close();
        }
    }
}
From source file:com.finderbots.miner.MinerWorkflow.java
License:Apache License
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, MinerOptions options, BaseUrlFilter crawlUrlFilter, BaseUrlFilter mineUrlFilter)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(MinerWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    //Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES), crawlDbPath.toString());
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString(), true);
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in a real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // add a regex url filter to filter outlinks
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());
    if (crawlUrlFilter != null) {
        outlinksPipe = new Each(outlinksPipe, new UrlFilter(crawlUrlFilter));
    }

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());
    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}
From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, AnalyzeHtml analyzer,
        PinterestCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(PinterestCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES); // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT); // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) {
        Pipe analyzerPipe = new Pipe("analyzer pipe", parsePipe.getTailPipe());
        analyzerPipe = new Each(analyzerPipe, analyzer);

        Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
        resultsPipe = new Each(resultsPipe, new CreateBooleanPreferenceFunction());

        Path minerOutputPath = new Path(curWorkingDirPath, CrawlConfig.MINER_SUBDIR_NAME);
        Tap minerOutputTap = new Hfs(new TextLine(), minerOutputPath.toString(), true);
        sinkMap.put(resultsPipe.getName(), minerOutputTap);
        tailPipes.add(resultsPipe);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerWorkflow.java
License:Apache License
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, MineRTCriticsPreferences prefsAnalyzer,
        RTCriticsCrawlAndMinerTool.Options options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    Properties props = HadoopUtils.getDefaultProperties(RTCriticsCrawlAndMinerWorkflow.class,
            options.isDebugLogging(), conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb doesn't exist at " + crawlDbPath);
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES); // set to two tries
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT); // and 10 sec timeout
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict what content types you
    // want to deal with - for now keep it simple.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s), lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    // BEWARE: The SimpleParser will discard HTML unless you pass in true as the last arg! So for mining
    // always pass in true!!!
    SimpleParser parser;
    if (options.isUseBoilerpipe()) {
        parser = new SimpleParser(new BoilerpipeContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy());
    } else if (options.isGenerateHTML()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else if (options.isEnableMiner()) {
        parser = new SimpleParser(new HtmlContentExtractor(), new SimpleLinkExtractor(), new ParserPolicy(), true);
    } else {
        parser = new SimpleParser();
    }
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    Tap writableSeqFileSink = null;
    Pipe writableSeqFileDataPipe = null;

    // Create the output map that connects each tail pipe to the appropriate sink, and the
    // list of tail pipes.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    List<Pipe> tailPipes = new ArrayList<Pipe>();

    if (options.isGenerateHTML()) {
        // Let's write out the parse as text:
        Pipe textParsePipe = new Pipe("text parse data", parsePipe.getTailPipe());
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.PARSED_TEXT_FN),
                new RegexReplace(new Fields(ParsedDatum.PARSED_TEXT_FN), "[\\r\\n\\t]+", " ", true),
                Fields.REPLACE);
        textParsePipe = new Each(textParsePipe, new Fields(ParsedDatum.URL_FN, ParsedDatum.PARSED_TEXT_FN),
                new Identity());
        Path textParsePath = new Path(curWorkingDirPath, CrawlConfig.HTML_SUBDIR_NAME);
        Tap textParseTap = new Hfs(new TextLine(), textParsePath.toString(), true);
        sinkMap.put(textParsePipe.getName(), textParseTap);
        tailPipes.add(textParsePipe);
    }

    if (options.isEnableMiner()) { // all the miner assembly happens here
        // analyze all pages that are to be mined, create an RTPageDatum
        // that will have data for a /m/ page OR a /critic/ page but not both
        // todo: in a perfect world there would be two datum types and we would
        // split them before analysis but it's nice to have all analysis in a single
        // function--maybe?
        Pipe prefsAnalyzerPipe = new Pipe("RT critics analyzer pipe", parsePipe.getTailPipe());
        prefsAnalyzerPipe = new Each(prefsAnalyzerPipe, prefsAnalyzer);

        // take all RTPageDatum, create a text line TSV then write to an output Tap
        Pipe prefsPipe = new Pipe("prefs pipe", prefsAnalyzerPipe);
        prefsPipe = new Each(prefsPipe, new CreateRTCriticsPrefsFunction());
        // todo: should we run through Unique?
        Path outPrefsPath = new Path(curWorkingDirPath, "prefs");
        Tap outPrefsTap = new Hfs(new TextLine(), outPrefsPath.toString(), true);
        sinkMap.put(prefsPipe.getName(), outPrefsTap);
        tailPipes.add(prefsPipe);

        // take all RTPageDatum, filter out all but /m/ pages,
        // make sure they are unique, create a TSV line per datum,
        // write to an output Tap
        Pipe filterMedia = new Pipe("filter_out_all_but_media_datum", prefsAnalyzerPipe);
        filterMedia = new Each(filterMedia, new FilterMediaDatumFunction());

        Pipe mediaPipe = new Pipe("create_media_records", filterMedia);
        mediaPipe = new Each(mediaPipe, new CreateRTMediaRecordsFunction());

        Pipe uniqueMedia = new Unique("uniquify_media_records", mediaPipe, new Fields("line"));

        Path outMediaPath = new Path(curWorkingDirPath, "media");
        Tap outMediaTap = new Hfs(new TextLine(), outMediaPath.toString(), true);
        sinkMap.put(uniqueMedia.getName(), outMediaTap);
        tailPipes.add(uniqueMedia);
    }

    // Let's output a WritableSequenceFile as an example - this file can
    // then be used as input when working with Mahout.
    writableSeqFileDataPipe = new Pipe("writable seqfile data",
            new Each(parsePipe.getTailPipe(), new CreateWritableSeqFileData()));

    Path writableSeqFileDataPath = new Path(curWorkingDirPath, CrawlConfig.EXTRACTED_TEXT_SUBDIR_NAME);
    writableSeqFileSink = new Hfs(new WritableSequenceFile(
            new Fields(CrawlConfig.WRITABLE_SEQ_FILE_KEY_FN, CrawlConfig.WRITABLE_SEQ_FILE_VALUE_FN),
            Text.class, Text.class), writableSeqFileDataPath.toString());

    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe,
            new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()));
    if (urlFilter != null) {
        urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    }

    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    Pipe urlFromFetchPipe = new Pipe("url from fetch", statusPipe);
    urlFromFetchPipe = new Each(urlFromFetchPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    sinkMap.put(statusPipe.getName(), statusSink);
    tailPipes.add(statusPipe);

    sinkMap.put(contentPipe.getName(), contentSink);
    tailPipes.add(contentPipe);

    sinkMap.put(parsePipe.getTailPipe().getName(), parseSink);
    tailPipes.add(parsePipe.getTailPipe());

    sinkMap.put(outputPipe.getName(), loopCrawldbSink);
    tailPipes.add(outputPipe);

    sinkMap.put(writableSeqFileDataPipe.getName(), writableSeqFileSink);
    tailPipes.add(writableSeqFileDataPipe);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, tailPipes.toArray(new Pipe[tailPipes.size()]));

    return flow;
}
From source file:com.fullcontact.cassandra.io.util.RandomAccessReader.java
License:Apache License
protected RandomAccessReader(Path file, int bufferSize, boolean skipIOCache, PoolingSegmentedFile owner,
        FileSystem fs) throws FileNotFoundException {
    inputPath = file;

    try {
        inputFileStatus = fs.getFileStatus(inputPath);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.fs = fs;

    try {
        this.input = fs.open(file);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    this.owner = owner;

    filePath = file.toString();

    // allocating required size of the buffer
    if (bufferSize <= 0)
        throw new IllegalArgumentException("bufferSize must be positive");
    buffer = new byte[bufferSize];

    this.skipIOCache = skipIOCache;

    // we can cache file length in read-only mode
    try {
        fileLength = fs.getFileStatus(file).getLen();
    } catch (IOException e) {
        throw new FSReadError(e, filePath);
    }
    validBufferBytes = -1; // that will trigger reBuffer() on demand by read/seek operations
}
From source file:com.fullcontact.sstable.hadoop.mapreduce.SSTableInputFormat.java
License:Apache License
@Override
protected boolean isSplitable(final JobContext context, final Path filename) {
    if (SSTablePredicates.IS_SSTABLE.apply(filename.toString())) {
        LOG.debug("{} is splittable.", filename);
        return indexes.containsKey(filename);
    } else {
        // Delegate non-sstable files to the FileInputFormat base class.
        LOG.debug("{} is not splittable.", filename);
        return super.isSplitable(context, filename);
    }
}
From source file:com.fullcontact.sstable.index.SSTableIndexIndexer.java
License:Apache License
public void index(final Path sstablePath) throws IOException {
    final FileSystem fileSystem = FileSystem.get(URI.create(sstablePath.toString()), configuration);
    final FileStatus fileStatus = fileSystem.getFileStatus(sstablePath);

    if (fileStatus.isDir()) {
        LOG.info("SSTable Indexing directory {}", sstablePath);
        final FileStatus[] statuses = fileSystem.listStatus(sstablePath);
        for (final FileStatus childStatus : statuses) {
            index(childStatus.getPath());
        }
    } else if (sstablePath.toString().endsWith(SST_EXTENSION)) {
        final Path sstableIndexPath = new Path(sstablePath.toString() + SSTableIndexIndex.SSTABLE_INDEX_SUFFIX);
        if (fileSystem.exists(sstableIndexPath)) {
            LOG.info("Skipping as SSTable index file already exists for {}", sstablePath);
        } else {
            // Kick off a thread for the index.
            final ListenableFuture<IndexRequest> indexFuture = service.submit(new Callable<IndexRequest>() {
                @Override
                public IndexRequest call() throws Exception {
                    final long startTime = System.currentTimeMillis();
                    final long fileSize = fileStatus.getLen();

                    LOG.info("Indexing SSTable file {}, size {} GB...", sstablePath,
                            decimalFormat.format(fileSize / (1024.0 * 1024.0 * 1024.0)));

                    indexSingleFile(fileSystem, sstablePath);

                    return new IndexRequest(sstableIndexPath, startTime, fileSize);
                }
            });

            Futures.addCallback(indexFuture, new FutureCallback<IndexRequest>() {
                public void onSuccess(final IndexRequest indexRequest) {
                    long indexSize = 0;

                    try {
                        indexSize = fileSystem.getFileStatus(indexRequest.getIndexPath()).getLen();
                    } catch (IOException e) {
                        LOG.error("Error getting file status for index path: {}", indexRequest.getIndexPath());
                    }

                    final double elapsed = (System.currentTimeMillis() - indexRequest.getStartTime()) / 1000.0;

                    LOG.info("Completed SSTable Indexing in {} seconds ({} MB/s). Index size is {} KB.",
                            decimalFormat.format(elapsed),
                            decimalFormat.format(indexRequest.getFileSize() / (1024.0 * 1024.0 * elapsed)),
                            decimalFormat.format(indexSize / 1024.0));
                }

                public void onFailure(Throwable e) {
                    LOG.error("Failed to index.", e);
                }
            });
        }
    }
}