List of usage examples for the org.apache.hadoop.mapred.JobConf constructor JobConf()
public JobConf()
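The no-argument constructor builds a job configuration from whatever default Hadoop resources (core-site.xml, mapred-site.xml, and so on) are on the classpath; most of the snippets below use the resulting JobConf either as a plain Configuration for FileSystem access or as the starting point for a JobClient. Below is a minimal sketch of that pattern; the class name JobConfDemo and the /tmp output path are placeholders, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class JobConfDemo {
    public static void main(String[] args) throws IOException {
        // Empty constructor: the conf is populated from the default resources
        // (core-site.xml, mapred-site.xml, ...) found on the classpath.
        JobConf conf = new JobConf();

        // Common pattern in the snippets below: treat the JobConf as a plain
        // Configuration to resolve a FileSystem and clean an output directory.
        Path output = new Path("/tmp/jobconf-demo-output");   // placeholder path
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }

        // The JobConf is also the entry point for talking to the JobTracker.
        JobClient client = new JobClient(conf);
        System.out.println("Default FS: " + FileSystem.get(conf).getUri());
        client.close();
    }
}

The same conf object can then be populated with a job name, mapper/reducer classes, and input/output paths before submission, as the Azkaban createJobConf() example below illustrates.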
From source file:azkaban.jobtype.connectors.TeradataToHdfsJobRunnerMain.java
License:Apache License
private void runCopyTdToHdfs() throws IOException {
    if (Boolean.valueOf(_jobProps.getProperty("force.output.overwrite", "false").trim())) {
        Path path = new Path(_jobProps.getProperty(TdchConstants.TARGET_HDFS_PATH_KEY));
        _logger.info("Deleting output directory " + path.toUri());
        JobConf conf = new JobConf();
        path.getFileSystem(conf).delete(path, true);
    }
    _logger.info(String.format("Executing %s with params: %s",
            TeradataToHdfsJobRunnerMain.class.getSimpleName(), _params));
    TeradataImportTool.main(_params.toTdchParams());
}
From source file:azkaban.jobtype.javautils.AbstractHadoopJob.java
License:Apache License
@SuppressWarnings("rawtypes") public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass) throws IOException, URISyntaxException { JobConf conf = new JobConf(); // set custom class loader with custom find resource strategy. conf.setJobName(getJobName());/*from w ww.j av a 2s.c om*/ conf.setMapperClass(mapperClass); if (reducerClass != null) { conf.setReducerClass(reducerClass); } if (props.getBoolean("is.local", false)) { conf.set("mapred.job.tracker", "local"); conf.set("fs.default.name", "file:///"); conf.set("mapred.local.dir", "/tmp/map-red"); logger.info("Running locally, no hadoop jar set."); } else { HadoopUtils.setClassLoaderAndJar(conf, getClass()); logger.info("Setting hadoop jar file for class:" + getClass() + " to " + conf.getJar()); logger.info("*************************************************************************"); logger.info( " Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ") "); logger.info("*************************************************************************"); } // set JVM options if present if (props.containsKey("mapred.child.java.opts")) { conf.set("mapred.child.java.opts", props.getString("mapred.child.java.opts")); logger.info("mapred.child.java.opts set to " + props.getString("mapred.child.java.opts")); } // set input and output paths if they are present if (props.containsKey("input.paths")) { List<String> inputPaths = props.getStringList("input.paths"); if (inputPaths.size() == 0) throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'"); for (String path : inputPaths) { HadoopUtils.addAllSubPaths(conf, new Path(path)); } } if (props.containsKey("output.path")) { String location = props.get("output.path"); FileOutputFormat.setOutputPath(conf, new Path(location)); // For testing purpose only remove output file if exists if (props.getBoolean("force.output.overwrite", false)) { FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf); fs.delete(FileOutputFormat.getOutputPath(conf), true); } } // Adds External jars to hadoop classpath String externalJarList = props.getString("hadoop.external.jarFiles", null); if (externalJarList != null) { FileSystem fs = FileSystem.get(conf); String[] jarFiles = externalJarList.split(","); for (String jarFile : jarFiles) { logger.info("Adding extenral jar File:" + jarFile); DistributedCache.addFileToClassPath(new Path(jarFile), conf, fs); } } // Adds distributed cache files String cacheFileList = props.getString("hadoop.cache.files", null); if (cacheFileList != null) { String[] cacheFiles = cacheFileList.split(","); for (String cacheFile : cacheFiles) { logger.info("Adding Distributed Cache File:" + cacheFile); DistributedCache.addCacheFile(new URI(cacheFile), conf); } } // Adds distributed cache files String archiveFileList = props.getString("hadoop.cache.archives", null); if (archiveFileList != null) { String[] archiveFiles = archiveFileList.split(","); for (String archiveFile : archiveFiles) { logger.info("Adding Distributed Cache Archive File:" + archiveFile); DistributedCache.addCacheArchive(new URI(archiveFile), conf); } } String hadoopCacheJarDir = props.getString("hdfs.default.classpath.dir", null); if (hadoopCacheJarDir != null) { FileSystem fs = FileSystem.get(conf); if (fs != null) { FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir)); if (status != null) { for (int i = 0; i < status.length; ++i) { if (!status[i].isDir()) { Path path = new Path(hadoopCacheJarDir, 
status[i].getPath().getName()); logger.info("Adding Jar to Distributed Cache Archive File:" + path); DistributedCache.addFileToClassPath(path, conf, fs); } } } else { logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty."); } } else { logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist"); } } for (String key : getProps().getKeySet()) { String lowerCase = key.toLowerCase(); if (lowerCase.startsWith(HADOOP_PREFIX)) { String newKey = key.substring(HADOOP_PREFIX.length()); conf.set(newKey, getProps().get(key)); } } HadoopUtils.setPropsInJob(conf, getProps()); // put in tokens if (System.getenv(HADOOP_TOKEN_FILE_LOCATION) != null) { conf.set(MAPREDUCE_JOB_CREDENTIALS_BINARY, System.getenv(HADOOP_TOKEN_FILE_LOCATION)); } return conf; }
From source file:azkaban.security.HadoopSecurityManager_H_1_0.java
License:Apache License
@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {
            logger.info("Pre-fetching Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            logger.info("HiveConf.ConfVars.METASTOREURIS.varname "
                    + hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname));
            logger.info("HIVE_METASTORE_SASL_ENABLED " + hiveConf.get(HIVE_METASTORE_SASL_ENABLED));
            logger.info("HIVE_METASTORE_KERBEROS_PRINCIPAL " + hiveConf.get(HIVE_METASTORE_KERBEROS_PRINCIPAL));
            logger.info("HIVE_METASTORE_LOCAL " + hiveConf.get(HIVE_METASTORE_LOCAL));

            HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
            String hcatTokenStr = hiveClient.getDelegationToken(userToProxy,
                    UserGroupInformation.getLoginUser().getShortUserName());
            Token<DelegationTokenIdentifier> hcatToken = new Token<DelegationTokenIdentifier>();
            hcatToken.decodeFromUrlString(hcatTokenStr);
            logger.info("Created hive metastore token: " + hcatTokenStr);
            logger.info("Token kind: " + hcatToken.getKind());
            logger.info("Token id: " + hcatToken.getIdentifier());
            logger.info("Token service: " + hcatToken.getService());
            cred.addToken(hcatToken.getService(), hcatToken);
        } catch (Exception e) {
            e.printStackTrace();
            logger.error("Failed to get hive metastore token." + e.getMessage() + e.getCause());
        } catch (Throwable t) {
            t.printStackTrace();
            logger.error("Failed to get hive metastore token." + t.getMessage() + t.getCause());
        }
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs.getDelegationToken(userToProxy);
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException("Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());

                    cred.addToken(fsToken.getService(), fsToken);
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobClient jobClient = new JobClient(new JobConf());
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient.getDelegationToken(new Text("mr token"));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());

                    cred.addToken(mrdt.getService(), mrdt);
                }
            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                dos.close();
            }
            if (fos != null) {
                fos.close();
            }
        }

        // stash them to cancel after use.
        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());
    } catch (Exception e) {
        e.printStackTrace();
        throw new HadoopSecurityManagerException("Failed to get hadoop tokens! " + e.getMessage() + e.getCause());
    } catch (Throwable t) {
        t.printStackTrace();
        throw new HadoopSecurityManagerException("Failed to get hadoop tokens! " + t.getMessage() + t.getCause());
    }
}
From source file:azkaban.security.HadoopSecurityManager_H_2_0.java
License:Apache License
@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens based on props for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {
            // first we fetch and save the default hcat token.
            logger.info("Pre-fetching default Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            Token<DelegationTokenIdentifier> hcatToken = fetchHcatToken(userToProxy, hiveConf, null, logger);
            cred.addToken(hcatToken.getService(), hcatToken);

            // check and see if user specified the extra hcat locations we need to
            // look at and fetch token.
            final List<String> extraHcatLocations = props.getStringList(EXTRA_HCAT_LOCATION);
            if (Collections.EMPTY_LIST != extraHcatLocations) {
                logger.info("Need to pre-fetch extra metaStore tokens from hive.");

                // start to process the user inputs.
                for (String thriftUrl : extraHcatLocations) {
                    logger.info("Pre-fetching metaStore token from : " + thriftUrl);

                    hiveConf = new HiveConf();
                    hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, thriftUrl);
                    hcatToken = fetchHcatToken(userToProxy, hiveConf, thriftUrl, logger);
                    cred.addToken(hcatToken.getService(), hcatToken);
                }
            }
        } catch (Throwable t) {
            String message = "Failed to get hive metastore token." + t.getMessage() + t.getCause();
            logger.error(message, t);
            throw new HadoopSecurityManagerException(message);
        }
    }

    if (props.getBoolean(OBTAIN_JOBHISTORYSERVER_TOKEN, false)) {
        YarnRPC rpc = YarnRPC.create(conf);
        final String serviceAddr = conf.get(JHAdminConfig.MR_HISTORY_ADDRESS);

        logger.debug("Connecting to HistoryServer at: " + serviceAddr);
        HSClientProtocol hsProxy = (HSClientProtocol) rpc.getProxy(HSClientProtocol.class,
                NetUtils.createSocketAddr(serviceAddr), conf);
        logger.info("Pre-fetching JH token from job history server");

        Token<?> jhsdt = null;
        try {
            jhsdt = getDelegationTokenFromHS(hsProxy);
        } catch (Exception e) {
            logger.error("Failed to fetch JH token", e);
            throw new HadoopSecurityManagerException("Failed to fetch JH token for " + userToProxy);
        }

        if (jhsdt == null) {
            logger.error("getDelegationTokenFromHS() returned null");
            throw new HadoopSecurityManagerException("Unable to fetch JH token for " + userToProxy);
        }

        logger.info("Created JH token: " + jhsdt.toString());
        logger.info("Token kind: " + jhsdt.getKind());
        logger.info("Token id: " + jhsdt.getIdentifier());
        logger.info("Token service: " + jhsdt.getService());

        cred.addToken(jhsdt.getService(), jhsdt);
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs.getDelegationToken(getMRTokenRenewerInternal(new JobConf()).toString());
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException("Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());

                    cred.addToken(fsToken.getService(), fsToken);

                    // getting additional name nodes tokens
                    String otherNamenodes = props.get(OTHER_NAMENODES_TO_GET_TOKEN);
                    if ((otherNamenodes != null) && (otherNamenodes.length() > 0)) {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + ": '" + otherNamenodes + "'");
                        String[] nameNodeArr = otherNamenodes.split(",");
                        Path[] ps = new Path[nameNodeArr.length];
                        for (int i = 0; i < ps.length; i++) {
                            ps[i] = new Path(nameNodeArr[i].trim());
                        }
                        TokenCache.obtainTokensForNamenodes(cred, ps, conf);
                        logger.info("Successfully fetched tokens for: " + otherNamenodes);
                    } else {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + " was not configured");
                    }
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobConf jobConf = new JobConf();
                    JobClient jobClient = new JobClient(jobConf);
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient
                            .getDelegationToken(getMRTokenRenewerInternal(jobConf));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());

                    cred.addToken(mrdt.getService(), mrdt);
                }
            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                try {
                    dos.close();
                } catch (Throwable t) {
                    // best effort
                    logger.error("encountered exception while closing DataOutputStream of the tokenFile", t);
                }
            }
            if (fos != null) {
                fos.close();
            }
        }

        // stash them to cancel after use.
        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());
    } catch (Exception e) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + e.getMessage() + e.getCause(), e);
    } catch (Throwable t) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + t.getMessage() + t.getCause(), t);
    }
}
From source file:bixo.config.BixoPlatform.java
License:Apache License
public BixoPlatform(Class applicationJarClass, Platform platform) throws Exception {
    super(applicationJarClass);

    if (platform == Platform.Local) {
        _platform = new LocalPlatform(applicationJarClass);
        setJobPollingInterval(CASCADING_LOCAL_JOB_POLLING_INTERVAL);
    } else {
        configureHadoopPlatform(applicationJarClass, new JobConf());
    }
}
From source file:bixo.examples.crawl.CreateUrlDatumFromOutlinksFunctionTest.java
License:Apache License
@SuppressWarnings("unchecked") @Test/* w w w . j a v a 2 s.com*/ public void testOperate() { CreateUrlDatumFromOutlinksFunction op = new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(), new SimpleUrlValidator()); HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class); Mockito.when(fp.getJobConf()).thenReturn(new JobConf()); OperationCall<NullContext> oc = Mockito.mock(OperationCall.class); FunctionCall<NullContext> fc = Mockito.mock(FunctionCall.class); TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class); Outlink outlink1 = new Outlink("http://bar.com/", "anchorText"); Outlink outlinks[] = { outlink1 }; ParsedDatum datum = new ParsedDatum("http://foo.com/", "foo.com", "parsed text", "en", "title", outlinks, null); datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, 0); TupleEntry entry = new TupleEntry(ParsedDatum.FIELDS); entry.setTuple(new Tuple(datum.getTuple())); Mockito.when(fc.getArguments()).thenReturn(entry); Mockito.when(fc.getOutputCollector()).thenReturn(collector); op.prepare(fp, oc); op.operate(fp, fc); op.cleanup(fp, oc); Mockito.verify(collector).add(Mockito.argThat(new MatchUrlDatum())); Mockito.verifyNoMoreInteractions(collector); }
From source file:bixo.examples.crawl.DemoCrawlTool.java
License:Apache License
public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch, then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);

        // This causes Bixo to block waiting for the next time it can fetch from a particular site.
        // TODO: may not be necessary in future versions of Bixo
        // defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.DemoCrawlWorkflowLRTest.java
License:Apache License
@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    DemoCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);

    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {
        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    options.setUseBoilerpipe(true);
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the /urls dir has 11 entries,
        // with one being previously crawled and the other 10 being pending.
        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();

        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            //     datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);
        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
From source file:bixo.examples.crawl.DemoStatusTool.java
License:Apache License
public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);
            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.JDBCCrawlTool.java
License:Apache License
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }

    JDBCTapFactory.shutdown();
}