List of usage examples for org.apache.hadoop.fs.FileSystem.get(Configuration)
public static FileSystem get(Configuration conf) throws IOException
From source file:cascading.platform.hadoop2.Hadoop2MR1Platform.java
License:Open Source License
@Override public synchronized void setUp() throws IOException { if (configuration != null) return;//w ww .j a v a 2 s . c o m if (!isUseCluster()) { LOG.info("not using cluster"); configuration = new JobConf(); // enforce settings to make local mode behave the same across distributions configuration.set("fs.defaultFS", "file:///"); configuration.set("mapreduce.framework.name", "local"); configuration.set("mapreduce.jobtracker.staging.root.dir", System.getProperty("user.dir") + "/" + "build/tmp/cascading/staging"); String stagingDir = configuration.get("mapreduce.jobtracker.staging.root.dir"); if (Util.isEmpty(stagingDir)) configuration.set("mapreduce.jobtracker.staging.root.dir", System.getProperty("user.dir") + "/build/tmp/cascading/staging"); fileSys = FileSystem.get(configuration); } else { LOG.info("using cluster"); if (Util.isEmpty(System.getProperty("hadoop.log.dir"))) System.setProperty("hadoop.log.dir", "build/test/log"); if (Util.isEmpty(System.getProperty("hadoop.tmp.dir"))) System.setProperty("hadoop.tmp.dir", "build/test/tmp"); new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored JobConf conf = new JobConf(); if (!Util.isEmpty(System.getProperty("mapred.jar"))) { LOG.info("using a remote cluster with jar: {}", System.getProperty("mapred.jar")); configuration = conf; ((JobConf) configuration).setJar(System.getProperty("mapred.jar")); if (!Util.isEmpty(System.getProperty("fs.default.name"))) { LOG.info("using {}={}", "fs.default.name", System.getProperty("fs.default.name")); configuration.set("fs.default.name", System.getProperty("fs.default.name")); } if (!Util.isEmpty(System.getProperty("mapred.job.tracker"))) { LOG.info("using {}={}", "mapred.job.tracker", System.getProperty("mapred.job.tracker")); configuration.set("mapred.job.tracker", System.getProperty("mapred.job.tracker")); } if (!Util.isEmpty(System.getProperty("fs.defaultFS"))) { LOG.info("using {}={}", "fs.defaultFS", System.getProperty("fs.defaultFS")); 
configuration.set("fs.defaultFS", System.getProperty("fs.defaultFS")); } if (!Util.isEmpty(System.getProperty("yarn.resourcemanager.address"))) { LOG.info("using {}={}", "yarn.resourcemanager.address", System.getProperty("yarn.resourcemanager.address")); configuration.set("yarn.resourcemanager.address", System.getProperty("yarn.resourcemanager.address")); } if (!Util.isEmpty(System.getProperty("mapreduce.jobhistory.address"))) { LOG.info("using {}={}", "mapreduce.jobhistory.address", System.getProperty("mapreduce.jobhistory.address")); configuration.set("mapreduce.jobhistory.address", System.getProperty("mapreduce.jobhistory.address")); } configuration.set("mapreduce.user.classpath.first", "true"); // use test dependencies configuration.set("mapreduce.framework.name", "yarn"); fileSys = FileSystem.get(configuration); } else { conf.setBoolean("yarn.is.minicluster", true); // conf.setInt( "yarn.nodemanager.delete.debug-delay-sec", -1 ); // conf.set( "yarn.scheduler.capacity.root.queues", "default" ); // conf.set( "yarn.scheduler.capacity.root.default.capacity", "100" ); // disable blacklisting hosts not to fail localhost during unit tests conf.setBoolean("yarn.app.mapreduce.am.job.node-blacklisting.enable", false); dfs = new MiniDFSCluster(conf, 4, true, null); fileSys = dfs.getFileSystem(); FileSystem.setDefaultUri(conf, fileSys.getUri()); mr = MiniMRClientClusterFactory.create(this.getClass(), 4, conf); configuration = mr.getConfig(); } configuration.set("mapred.child.java.opts", "-Xmx512m"); configuration.setInt("mapreduce.job.jvm.numtasks", -1); configuration.setInt("mapreduce.client.completion.pollinterval", 50); configuration.setInt("mapreduce.client.progressmonitor.pollinterval", 50); configuration.setBoolean("mapreduce.map.speculative", false); configuration.setBoolean("mapreduce.reduce.speculative", false); } configuration.setInt("mapreduce.job.maps", numMappers); configuration.setInt("mapreduce.job.reduces", numReducers); Map<Object, Object> 
globalProperties = getGlobalProperties(); if (logger != null) globalProperties.put("log4j.logger", logger); FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests Hadoop2MR1Planner.copyProperties(configuration, globalProperties); // copy any external properties Hadoop2MR1Planner.copyConfiguration(properties, configuration); // put all properties on the jobconf }
From source file:cascading.platform.tez.Hadoop2TezPlatform.java
License:Open Source License
@Override public synchronized void setUp() throws IOException { if (configuration != null) return;//from w ww . j a v a 2s . c o m if (!isUseCluster()) { // Current usage requirements: // 1. Clients need to set "tez.local.mode" to true when creating a TezClient instance. (For the examples this can be done via -Dtez.local.mode=true) // 2. fs.defaultFS must be set to "file:///" // 2.1 If running examples - this must be set in tez-site.xml (so that it's picked up by the client, as well as the conf instances used to configure the Inputs / Outputs). // 2.2 If using programatically (without a tez-site.xml present). All configuration instances used (to crate the client / configure Inputs / Outputs) - must have this property set. // 3. tez.runtime.optimize.local.fetch needs to be set to true (either via tez-site.xml or in all configurations used to create the job (similar to fs.defaultFS in step 2)) // 4. tez.staging-dir must be set (either programatically or via tez-site.xml). // Until TEZ-1337 goes in - the staging-dir for the job is effectively the root of the filesystem (and where inputs are read from / written to if relative paths are used). 
LOG.info("not using cluster"); configuration = new Configuration(); configuration.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions()); // configuration.setInt( FlowRuntimeProps.GATHER_PARTITIONS, 1 ); // deadlocks if larger than 1 configuration.set(TezConfiguration.TEZ_LOCAL_MODE, "true"); configuration.set("fs.defaultFS", "file:///"); configuration.set("tez.runtime.optimize.local.fetch", "true"); // hack to prevent deadlocks where downstream processors are scheduled before upstream configuration.setInt("tez.am.inline.task.execution.max-tasks", 3); // testHashJoinMergeIntoHashJoinAccumulatedAccumulatedMerge fails if set to 2 configuration.set(TezConfiguration.TEZ_IGNORE_LIB_URIS, "true"); // in local mode, use local classpath configuration.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1); configuration.set(TezConfiguration.TEZ_GENERATE_DEBUG_ARTIFACTS, "true"); configuration.set("tez.am.mode.session", "true"); // allows multiple TezClient instances to be used in a single jvm if (!Util.isEmpty(System.getProperty("hadoop.tmp.dir"))) configuration.set("hadoop.tmp.dir", System.getProperty("hadoop.tmp.dir")); else configuration.set("hadoop.tmp.dir", "build/test/tmp"); fileSys = FileSystem.get(configuration); } else { LOG.info("using cluster"); if (Util.isEmpty(System.getProperty("hadoop.log.dir"))) System.setProperty("hadoop.log.dir", "build/test/log"); if (Util.isEmpty(System.getProperty("hadoop.tmp.dir"))) System.setProperty("hadoop.tmp.dir", "build/test/tmp"); new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored new File(System.getProperty("hadoop.tmp.dir")).mkdirs(); // ignored Configuration defaultConf = new Configuration(); defaultConf.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions()); defaultConf.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1); // defaultConf.set( TezConfiguration.TEZ_AM_LOG_LEVEL, "DEBUG" ); // defaultConf.set( TezConfiguration.TEZ_TASK_LOG_LEVEL, "DEBUG" ); 
defaultConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1); defaultConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false); defaultConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, System.getProperty("hadoop.tmp.dir")); miniDFSCluster = new MiniDFSCluster.Builder(defaultConf).numDataNodes(4).format(true).racks(null) .build(); fileSys = miniDFSCluster.getFileSystem(); Configuration tezConf = new Configuration(defaultConf); tezConf.set("fs.defaultFS", fileSys.getUri().toString()); // use HDFS tezConf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir"); // see MiniTezClusterWithTimeline as alternate miniTezCluster = new MiniTezCluster(getClass().getName(), 4, 1, 1); // todo: set to 4 miniTezCluster.init(tezConf); miniTezCluster.start(); configuration = miniTezCluster.getConfig(); // stats won't work after completion unless ATS is used if (setTimelineStore(configuration)) // true if ats can be loaded and configured for this hadoop version { configuration.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS, ATSHistoryLoggingService.class.getName()); configuration.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true); configuration.set(YarnConfiguration.TIMELINE_SERVICE_ADDRESS, "localhost:10200"); configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS, "localhost:8188"); configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_HTTPS_ADDRESS, "localhost:8190"); yarnHistoryServer = new ApplicationHistoryServer(); yarnHistoryServer.init(configuration); yarnHistoryServer.start(); } } configuration.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 1); configuration.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1); configuration.setInt(TezConfiguration.TEZ_AM_MAX_TASK_FAILURES_PER_NODE, 1); Map<Object, Object> globalProperties = getGlobalProperties(); if (logger != null) globalProperties.put("log4j.logger", logger); FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests 
Hadoop2TezPlanner.copyProperties(configuration, globalProperties); // copy any external properties Hadoop2TezPlanner.copyConfiguration(properties, configuration); // put all properties on the jobconf ExitUtil.disableSystemExit(); // forbidSystemExitCall(); }
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
@Override public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input) throws IOException { // always read via Hadoop FileSystem if in standalone/local mode, or if an RecordReader is provided if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) { LOG.info("delegating to parent"); return super.openForRead(flowProcess, input); }// ww w . j av a2 s. co m Path[] cachedFiles = getLocalCacheFiles(flowProcess); if (cachedFiles == null || cachedFiles.length == 0) return super.openForRead(flowProcess, null); List<Path> paths = new ArrayList<>(); List<Tap> taps = new ArrayList<>(); if (isSimpleGlob()) { FileSystem fs = FileSystem.get(flowProcess.getConfig()); FileStatus[] statuses = fs.globStatus(getHfs().getPath()); for (FileStatus status : statuses) paths.add(status.getPath()); } else { paths.add(getHfs().getPath()); } for (Path pathToFind : paths) { for (Path path : cachedFiles) { if (path.toString().endsWith(pathToFind.getName())) { LOG.info("found {} in distributed cache", path); taps.add(new Lfs(getScheme(), path.toString())); } } } if (paths.isEmpty()) // not in cache, read from HDFS { LOG.info("could not find files in local resource path. delegating to parent: {}", super.getIdentifier()); return super.openForRead(flowProcess, input); } return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input); }
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
/**
 * Registers the resource(s) behind the given {@code Hfs} via {@code registerURI} and then
 * signals the tap that source configuration is complete.
 * <p>
 * For a simple glob every matching file is registered individually; otherwise the tap's own
 * path is registered.
 *
 * @param process the current flow process, handed on to {@code sourceConfInitComplete}
 * @param conf    the configuration used to look up the file system and to register URIs on
 * @param hfs     the tap being registered
 * @throws IOException  if the file system cannot be accessed
 * @throws TapException if the glob expression matches no files
 */
private void registerHfs(FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
        throws IOException {
    if (!isSimpleGlob()) {
        registerURI(conf, hfs.getPath());
    } else {
        FileSystem fileSystem = FileSystem.get(conf);
        FileStatus[] matches = fileSystem.globStatus(getHfs().getPath());
        boolean nothingMatched = matches == null || matches.length == 0;

        if (nothingMatched) {
            String message = String.format("glob expression %s does not match any files on the filesystem",
                    getHfs().getPath());
            throw new TapException(message);
        }

        for (int i = 0; i < matches.length; i++) {
            registerURI(conf, matches[i].getPath());
        }
    }

    hfs.sourceConfInitComplete(process, conf);
}
From source file:cascading.tap.hadoop.Hfs.java
License:Open Source License
/**
 * Returns the file system obtained from {@code FileSystem.get} for the given configuration.
 *
 * @param configuration the configuration to resolve the file system from
 * @return the resolved file system
 * @throws TapException wrapping the {@code IOException} if the file system cannot be obtained
 */
protected FileSystem getDefaultFileSystem(Configuration configuration) {
    final FileSystem defaultFileSystem;

    try {
        defaultFileSystem = FileSystem.get(configuration);
    } catch (IOException exception) {
        throw new TapException("unable to get handle to underlying filesystem", exception);
    }

    return defaultFileSystem;
}
From source file:cascading.tap.hadoop.ZipInputFormatTest.java
License:Open Source License
public void testSplits() throws Exception { JobConf job = new JobConf(); FileSystem currentFs = FileSystem.get(job); Path file = new Path(workDir, "test.zip"); Reporter reporter = Reporter.NULL;/*www .jav a 2 s. co m*/ int seed = new Random().nextInt(); LOG.info("seed = " + seed); Random random = new Random(seed); FileInputFormat.setInputPaths(job, file); for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) { ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream); long length = 0; LOG.debug("creating; zip file with entries = " + entries); // for each entry in the zip file for (int entryCounter = 0; entryCounter < entries; entryCounter++) { // construct zip entries splitting MAX_LENGTH between entries long entryLength = MAX_LENGTH / entries; ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt"); zipEntry.setMethod(ZipEntry.DEFLATED); zos.putNextEntry(zipEntry); for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) { zos.write(Long.toString(length).getBytes()); zos.write("\n".getBytes()); } zos.flush(); zos.closeEntry(); } zos.flush(); zos.close(); currentFs.delete(file, true); OutputStream outputStream = currentFs.create(file); byteArrayOutputStream.writeTo(outputStream); outputStream.close(); ZipInputFormat format = new ZipInputFormat(); format.configure(job); LongWritable key = new LongWritable(); Text value = new Text(); InputSplit[] splits = format.getSplits(job, 100); BitSet bits = new BitSet((int) length); for (int j = 0; j < splits.length; j++) { LOG.debug("split[" + j + "]= " + splits[j]); RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter); try { int count = 0; while (reader.next(key, value)) { int v = Integer.parseInt(value.toString()); LOG.debug("read " + v); if (bits.get(v)) LOG.warn("conflict with " + v + " in split " + j + " at position 
" + reader.getPos()); assertFalse("key in multiple partitions.", bits.get(v)); bits.set(v); count++; } LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count); } finally { reader.close(); } } assertEquals("some keys in no partition.", length, bits.cardinality()); } }
From source file:cascading.tap.Hfs.java
License:Open Source License
/**
 * Returns the file system obtained from {@code FileSystem.get} for the given job configuration.
 *
 * @param jobConf the job configuration to resolve the file system from
 * @return the resolved file system
 * @throws IOException if the file system cannot be obtained
 */
protected FileSystem getDefaultFileSystem(JobConf jobConf) throws IOException {
    FileSystem defaultFileSystem = FileSystem.get(jobConf);

    return defaultFileSystem;
}
From source file:cascading.tap.hive.HiveTap.java
License:Open Source License
@Override public boolean resourceExists(JobConf conf) throws IOException { IMetaStoreClient metaStoreClient = null; try {/*from w ww. j ava 2s .com*/ metaStoreClient = createMetaStoreClient(); Table table = metaStoreClient.getTable(tableDescriptor.getDatabaseName(), tableDescriptor.getTableName()); modifiedTime = table.getLastAccessTime(); // check if the schema matches the table descriptor. If not, throw an exception. if (strict) { LOG.info("strict mode: comparing existing hive table with table descriptor"); if (!table.getTableType().equals(tableDescriptor.toHiveTable().getTableType())) throw new HiveTableValidationException( String.format("expected a table of type '%s' but found '%s'", tableDescriptor.toHiveTable().getTableType(), table.getTableType())); // Check that the paths are the same FileSystem fs = FileSystem.get(conf); StorageDescriptor sd = table.getSd(); Path expectedPath = fs.makeQualified( new Path(tableDescriptor.getLocation(hiveConf.getVar(ConfVars.METASTOREWAREHOUSE)))); Path actualPath = fs.makeQualified(new Path(sd.getLocation())); if (!expectedPath.equals(actualPath)) throw new HiveTableValidationException( String.format("table in MetaStore does not have the sampe path. Expected %s got %s", expectedPath, actualPath)); List<FieldSchema> schemaList = sd.getCols(); if (schemaList.size() != tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length) throw new HiveTableValidationException(String.format( "table in MetaStore does not have same number of columns. expected %d got %d", tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length, schemaList.size())); for (int index = 0; index < schemaList.size(); index++) { FieldSchema schema = schemaList.get(index); String expectedColumnName = tableDescriptor.getColumnNames()[index]; String expectedColumnType = tableDescriptor.getColumnTypes()[index]; // this could be extended to the StorageDescriptor if necessary. 
if (!schema.getName().equalsIgnoreCase(expectedColumnName)) throw new HiveTableValidationException( String.format("hive schema mismatch: expected column name '%s', but found '%s'", expectedColumnName, schema.getName())); if (!schema.getType().equalsIgnoreCase(expectedColumnType)) throw new HiveTableValidationException( String.format("hive schema mismatch: expected column type '%s', but found '%s'", expectedColumnType, schema.getType())); } List<FieldSchema> schemaPartitions = table.getPartitionKeys(); if (schemaPartitions.size() != tableDescriptor.getPartitionKeys().length) throw new HiveTableValidationException(String.format( "table in MetaStore does not have same number of partition columns. expected %d got %d", tableDescriptor.getPartitionKeys().length, schemaPartitions.size())); int offset = tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length; for (int index = 0; index < schemaPartitions.size(); index++) { FieldSchema schema = schemaPartitions.get(index); String expectedColumnName = tableDescriptor.getColumnNames()[index + offset]; String expectedColumnType = tableDescriptor.getColumnTypes()[index + offset]; // this could be extended to the StorageDescriptor if necessary. if (!schema.getName().equalsIgnoreCase(expectedColumnName)) throw new HiveTableValidationException(String.format( "hive partition schema mismatch: expected column name '%s', but found '%s'", expectedColumnName, schema.getName())); if (!schema.getType().equalsIgnoreCase(expectedColumnType)) throw new HiveTableValidationException(String.format( "hive partition schema mismatch: expected column type '%s', but found '%s'", expectedColumnType, schema.getType())); } } return true; } catch (MetaException exception) { throw new IOException(exception); } catch (NoSuchObjectException exception) { return false; } catch (TException exception) { throw new IOException(exception); } finally { if (metaStoreClient != null) metaStoreClient.close(); } }
From source file:cc.slda.AnnotateDocuments.java
License:Apache License
/** * Runs this tool./*from w ww. j a v a 2s . com*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); options.addOption(OptionBuilder.withArgName(PCUTOFF).hasArg() .withDescription("probability of topic assignment").create(PCUTOFF)); options.addOption(OptionBuilder.withArgName(INDEX).hasArg() .withDescription("path to data directory containing term and title indices").create(INDEX)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(INDEX)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String indexPath = cmdline.getOptionValue(INDEX); String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? 
Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; float cutoff = 0.9f; if (cmdline.hasOption(PCUTOFF)) { cutoff = Float.parseFloat(cmdline.getOptionValue(PCUTOFF)); } LOG.info("Tool: " + AnnotateDocuments.class.getSimpleName()); LOG.info(" - indices path: " + indexPath); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of reducers: " + reduceTasks); LOG.info(" - log(probCutoff): " + Math.log(cutoff)); Configuration conf = getConf(); FileSystem fs = FileSystem.get(conf); Job job = Job.getInstance(conf); job.setJobName(AnnotateDocuments.class.getSimpleName()); job.setJarByClass(AnnotateDocuments.class); String termIndex = indexPath + Path.SEPARATOR + TERM; String titleIndex = indexPath + Path.SEPARATOR + TITLE; Path termIndexPath = new Path(termIndex); Path titleIndexPath = new Path(titleIndex); Preconditions.checkArgument(fs.exists(termIndexPath), "Missing term index files... " + termIndexPath); DistributedCache.addCacheFile(termIndexPath.toUri(), job.getConfiguration()); Preconditions.checkArgument(fs.exists(titleIndexPath), "Missing title index files... " + titleIndexPath); DistributedCache.addCacheFile(titleIndexPath.toUri(), job.getConfiguration()); job.setNumReduceTasks(reduceTasks); conf.setFloat(PCUTOFF, cutoff); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(HMapSIW.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(HMapSIW.class); job.setMapperClass(MyMapper.class); // Delete the output directory if it exists already. 
Path outputDir = new Path(outputPath); FileSystem.get(conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:cc.slda.DisplayTopic.java
License:Apache License
@SuppressWarnings("unchecked") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(Settings.HELP_OPTION, false, "print the help message"); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("input beta file").create(Settings.INPUT_OPTION)); options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg() .withDescription("term index file").create(ParseCorpus.INDEX)); options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg() .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION)); String betaString = null;/* ww w . ja v a 2 s. co m*/ String indexString = null; int topDisplay = TOP_DISPLAY; CommandLineParser parser = new GnuParser(); HelpFormatter formatter = new HelpFormatter(); try { CommandLine line = parser.parse(options, args); if (line.hasOption(Settings.HELP_OPTION)) { formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } if (line.hasOption(Settings.INPUT_OPTION)) { betaString = line.getOptionValue(Settings.INPUT_OPTION); } else { throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized..."); } if (line.hasOption(ParseCorpus.INDEX)) { indexString = line.getOptionValue(ParseCorpus.INDEX); } else { throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized..."); } if (line.hasOption(TOP_DISPLAY_OPTION)) { topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION)); } } catch (ParseException pe) { System.err.println(pe.getMessage()); formatter.printHelp(ParseCorpus.class.getName(), options); System.exit(0); } catch (NumberFormatException nfe) { System.err.println(nfe.getMessage()); System.exit(0); } JobConf conf = new JobConf(DisplayTopic.class); FileSystem fs = FileSystem.get(conf); Path indexPath = new Path(indexString); Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid 
index path..."); Path betaPath = new Path(betaString); Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path..."); SequenceFile.Reader sequenceFileReader = null; try { IntWritable intWritable = new IntWritable(); Text text = new Text(); Map<Integer, String> termIndex = new HashMap<Integer, String>(); sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf); while (sequenceFileReader.next(intWritable, text)) { termIndex.put(intWritable.get(), text.toString()); } PairOfIntFloat pairOfIntFloat = new PairOfIntFloat(); // HMapIFW hmap = new HMapIFW(); HMapIDW hmap = new HMapIDW(); TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>(); sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf); while (sequenceFileReader.next(pairOfIntFloat, hmap)) { treeMap.clear(); System.out.println("=============================="); System.out.println( "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement()); System.out.println("=============================="); Iterator<Integer> itr1 = hmap.keySet().iterator(); int temp1 = 0; while (itr1.hasNext()) { temp1 = itr1.next(); treeMap.put(-hmap.get(temp1), temp1); if (treeMap.size() > topDisplay) { treeMap.remove(treeMap.lastKey()); } } Iterator<Double> itr2 = treeMap.keySet().iterator(); double temp2 = 0; while (itr2.hasNext()) { temp2 = itr2.next(); if (termIndex.containsKey(treeMap.get(temp2))) { System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2); } else { System.out.println("How embarrassing! Term index not found..."); } } } } finally { IOUtils.closeStream(sequenceFileReader); } return 0; }