List of usage examples for org.apache.hadoop.fs Path Path
public Path(URI aUri)
From source file:ColumnStorageBasicTest.java
License:Open Source License
public void testConstructorFieldInDiffFile() { try {//from w w w.java 2 s.c o m Configuration conf = new Configuration(); Path path = new Path(prefix); FileSystem fs = FileSystem.get(conf); ArrayList<Short> idxs = new ArrayList<Short>(10); idxs.add((short) 0); idxs.add((short) 7); idxs.add((short) 4); ColumnStorageClient client = new ColumnStorageClient(path, idxs, conf); if (client.cp == null) { fail("cp null"); } if (client.fds.length != 3) { fail("error fds.len:" + client.fds.length); } for (int i = 0; i < client.fds.length; i++) { if (client.fds[i] == null) { fail("null fd:" + i); } } if (client.list.size() != 3) { fail("error list size:" + client.list.size()); } if (!client.list.get(0).equals(byteFileName)) { fail("error filename:" + client.list.get(0)); } if (!client.list.get(1).equals(multiFileNameString)) { fail("error filename:" + client.list.get(1)); } if (!client.list.get(2).equals(floatFileName)) { fail("error filename:" + client.list.get(2)); } } catch (Exception e) { e.printStackTrace(); fail("get exception:" + e.getMessage()); } }
From source file:ColumnStorageBasicTest.java
License:Open Source License
public void testGetRecordByLine() { try {//from ww w. j av a 2s . com Configuration conf = new Configuration(); Path path = new Path(prefix); FileSystem fs = FileSystem.get(conf); ArrayList<Short> idxs = new ArrayList<Short>(10); idxs.add((short) 0); idxs.add((short) 7); idxs.add((short) 4); ColumnStorageClient client = new ColumnStorageClient(path, idxs, conf); Record record = client.getRecordByLine(-1); if (record != null) { fail("should return null record 1"); } record = client.getRecordByLine(10); if (record != null) { fail("should return null record 2"); } for (int i = 0; i < 10; i++) { record = client.getRecordByLine(i); if (record == null) { fail("should not return null record"); } if (record.fieldValues().size() != 5) { fail("error field num:" + record.fieldValues().size()); } record.show(); judgeNofixRecord(record, i); } } catch (Exception e) { e.printStackTrace(); fail("get exception:" + e.getMessage()); } }
From source file:MaleUsersBelow7Years.java
public static void main(String args[]) throws Exception { Configuration configuration = new Configuration(); Job job = new Job(configuration, "CountMaleUsersLessThan7"); job.setJarByClass(MaleUsersBelow7Years.class); job.setMapperClass(Map.class); job.setReducerClass(Reducer.class); job.setCombinerClass(Reducer.class); //set output and input formats;mapper-input reducer-output job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(NullWritable.class); FileInputFormat.addInputPath(job, new Path(args[0])); //path for input file FileOutputFormat.setOutputPath(job, new Path(args[1])); // Path for output file System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:update_sentiment.java
License:LGPL
public static void runjob(String input, String output) throws Exception { JobConf conf = new JobConf(update_sentiment.class); conf.setJobName("Update_Sentiment_Train"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(Map.class); conf.setCombinerClass(Reduce.class); conf.setReducerClass(Reduce.class); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); JobClient.runJob(conf);/*from w ww . j a v a 2 s. c om*/ }
From source file:PostgresClassifier.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 5) { System.out.println(/* w w w.j a v a2 s.c o m*/ "Arguments: [model] [label index] [dictionnary] [document frequency] [input postgres table]"); return; } String modelPath = args[0]; String labelIndexPath = args[1]; String dictionaryPath = args[2]; String documentFrequencyPath = args[3]; String tablename = args[4]; Configuration configuration = new Configuration(); // model is a matrix (wordId, labelId) => probability score NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration); StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model); // labels is a map label => classId Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath)); Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath)); Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath)); // analyzer used to extract word from tweet Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); int labelCount = labels.size(); int documentCount = documentFrequency.get(-1).intValue(); System.out.println("Number of labels: " + labelCount); System.out.println("Number of documents in training set: " + documentCount); Connection c = null; Statement stmt = null; Statement stmtU = null; try { Class.forName("org.postgresql.Driver"); c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl"); c.setAutoCommit(false); System.out.println("Opened database successfully"); stmt = c.createStatement(); stmtU = c.createStatement(); ResultSet rs = stmt.executeQuery("SELECT * FROM " + tablename + " WHERE rep is null"); while (rs.next()) { String seq = rs.getString("seq"); //String rep = rs.getString("rep"); String body = rs.getString("body"); //String category = rep; String id = seq; String message = body; //System.out.println("Doc: " + id + "\t" + message); Multiset<String> words = ConcurrentHashMultiset.create(); // extract words from tweet TokenStream ts = analyzer.tokenStream("text", new StringReader(message)); CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); int wordCount = 0; while (ts.incrementToken()) { if (termAtt.length() > 0) { String word = ts.getAttribute(CharTermAttribute.class).toString(); Integer wordId = dictionary.get(word); // if the word is not in the dictionary, skip it if (wordId != null) { words.add(word); wordCount++; } } } // Mark : Modified ts.end(); ts.close(); // create vector wordId => weight using tfidf Vector vector = new RandomAccessSparseVector(10000); TFIDF tfidf = new TFIDF(); for (Multiset.Entry<String> entry : words.entrySet()) { String word = entry.getElement(); int count = entry.getCount(); Integer wordId = dictionary.get(word); Long freq = documentFrequency.get(wordId); double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount); vector.setQuick(wordId, tfIdfValue); } // With the classifier, we get one score for each label // The label with the highest score is the one the tweet is more likely to // be associated to Vector resultVector = classifier.classifyFull(vector); double bestScore = -Double.MAX_VALUE; int bestCategoryId = -1; for (Element element : resultVector.all()) { int categoryId = element.index(); double score = element.get(); if (score > bestScore) { bestScore = score; bestCategoryId = categoryId; } //System.out.print(" " + labels.get(categoryId) + ": " + score); } //System.out.println(" => " + labels.get(bestCategoryId)); //System.out.println("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id ); stmtU.executeUpdate("UPDATE " + tablename + " SET rep = '" + labels.get(bestCategoryId) + "' WHERE seq = " + id); } rs.close(); stmt.close(); stmtU.close(); c.commit(); c.close(); analyzer.close(); } catch (Exception e) { System.err.println(e.getClass().getName() + ": " + e.getMessage()); System.exit(0); } }
From source file:PrimeDivisor.java
License:Apache License
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 2) { System.err.println("Usage: wordcount <in> <out>"); System.exit(2);//from www. j av a 2 s . co m } Job job = new Job(conf, "word count"); job.setJarByClass(PrimeDivisor.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(IntSumReducer.class); job.setReducerClass(IntSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:HadoopWordCount.java
License:Open Source License
public static void main(String[] args) throws Exception { System.setProperty("hazelcast.logging.type", "log4j"); Path inputPath = new Path(HadoopWordCount.class.getClassLoader().getResource("books").getPath()); Path outputPath = new Path(OUTPUT_PATH); // set up the Hadoop job config, the input and output paths and formats JobConf jobConfig = new JobConf(); jobConfig.setInputFormat(TextInputFormat.class); jobConfig.setOutputFormat(TextOutputFormat.class); TextOutputFormat.setOutputPath(jobConfig, outputPath); TextInputFormat.addInputPath(jobConfig, inputPath); // Delete the output directory, if already exists FileSystem.get(new Configuration()).delete(outputPath, true); JetConfig cfg = new JetConfig(); cfg.setInstanceConfig(new InstanceConfig() .setCooperativeThreadCount(Math.max(1, getRuntime().availableProcessors() / 2))); JetInstance jetInstance = Jet.newJetInstance(cfg); Jet.newJetInstance(cfg);/* ww w . ja va 2 s . co m*/ try { System.out.print("\nCounting words from " + inputPath); long start = nanoTime(); jetInstance.newJob(buildDag(jobConfig)).execute().get(); System.out.print("Done in " + NANOSECONDS.toMillis(nanoTime() - start) + " milliseconds."); System.out.println("Output written to " + outputPath); } finally { Jet.shutdownAll(); } }
From source file:BMTTableLoader.java
License:Apache License
public int run(String[] args) { JobConf conf = new JobConf(getConf(), BMTTableLoader.class); GenericOptionsParser parser = new GenericOptionsParser(conf, args); conf.setJobName("BMTTableLoader"); conf.setMapperClass(Map.class); conf.setNumReduceTasks(0);/*from ww w . j a v a 2s . com*/ conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); List<String> other_args = new ArrayList<String>(); for (int i = 0; i < args.length; ++i) { other_args.add(args[i]); } FileInputFormat.setInputPaths(conf, new Path(other_args.get(0))); FileOutputFormat.setOutputPath(conf, new Path(other_args.get(1))); try { JobClient.runJob(conf); } catch (IOException e) { throw new RuntimeException(e); } return 0; }
From source file:WordCount.java
License:Apache License
public int run(String[] args) throws Exception { ///start/*from www. ja v a 2 s . c o m*/ final long startTime = System.currentTimeMillis(); String outputReducerType = "filesystem"; if (args != null && args[0].startsWith(OUTPUT_REDUCER_VAR)) { String[] s = args[0].split("="); if (s != null && s.length == 2) outputReducerType = s[1]; } logger.info("output reducer type: " + outputReducerType); // use a smaller page size that doesn't divide the row count evenly to exercise the paging logic better ConfigHelper.setRangeBatchSize(getConf(), 99); for (int i = 0; i < WordCountSetup.TEST_COUNT; i++) { String columnName = "userId"; Job job = new Job(getConf(), "wordcount"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); if (outputReducerType.equalsIgnoreCase("filesystem")) { job.setReducerClass(ReducerToFilesystem.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH_PREFIX + i)); } else { job.setReducerClass(ReducerToCassandra.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(ByteBuffer.class); job.setOutputValueClass(List.class); job.setOutputFormatClass(ColumnFamilyOutputFormat.class); ConfigHelper.setOutputColumnFamily(job.getConfiguration(), KEYSPACE, OUTPUT_COLUMN_FAMILY); job.getConfiguration().set(CONF_COLUMN_NAME, "sum"); } job.setInputFormatClass(ColumnFamilyInputFormat.class); ConfigHelper.setInputRpcPort(job.getConfiguration(), "9160"); ConfigHelper.setInputInitialAddress(job.getConfiguration(), "localhost"); ConfigHelper.setInputPartitioner(job.getConfiguration(), "RandomPartitioner"); ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY); SlicePredicate predicate = new SlicePredicate() .setColumn_names(Arrays.asList(ByteBufferUtil.bytes(columnName))); ConfigHelper.setInputSlicePredicate(job.getConfiguration(), predicate); // this will cause the predicate to be ignored in favor of scanning everything as a wide row //Son degisiklik // ConfigHelper.setInputColumnFamily(job.getConfiguration(), KEYSPACE, COLUMN_FAMILY, true); //System.out.println("tessssssaaat"); ConfigHelper.setOutputInitialAddress(job.getConfiguration(), "localhost"); ConfigHelper.setOutputPartitioner(job.getConfiguration(), "RandomPartitioner"); job.waitForCompletion(true); } //print final double duration = (System.currentTimeMillis() - startTime) / 1000.0; // after System.out.println(); System.out.println("Job Finished in " + duration + " seconds"); System.out.println(); return 0; }
From source file:ComputeCooccurrenceMatrixPairs.java
License:Apache License
/** * Runs this tool./*w w w . j a va 2s. c o m*/ */ @SuppressWarnings({ "static-access" }) public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("window size").create(WINDOW)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers") .create(NUM_REDUCERS)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) { System.out.println("args: " + Arrays.toString(args)); HelpFormatter formatter = new HelpFormatter(); formatter.setWidth(120); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String inputPath = cmdline.getOptionValue(INPUT); String outputPath = cmdline.getOptionValue(OUTPUT); int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS)) : 1; int window = cmdline.hasOption(WINDOW) ? Integer.parseInt(cmdline.getOptionValue(WINDOW)) : 2; LOG.info("Tool: " + ComputeCooccurrenceMatrixPairs.class.getSimpleName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - window: " + window); LOG.info(" - number of reducers: " + reduceTasks); Job job = Job.getInstance(getConf()); job.setJobName(ComputeCooccurrenceMatrixPairs.class.getSimpleName()); job.setJarByClass(ComputeCooccurrenceMatrixPairs.class); // Delete the output directory if it exists already. Path outputDir = new Path(outputPath); FileSystem.get(getConf()).delete(outputDir, true); job.getConfiguration().setInt("window", window); job.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); job.setMapOutputKeyClass(PairOfStrings.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(PairOfStrings.class); job.setOutputValueClass(IntWritable.class); job.setMapperClass(MyMapper.class); job.setCombinerClass(MyReducer.class); job.setReducerClass(MyReducer.class); job.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); job.waitForCompletion(true); System.out.println("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }