List of usage examples for the org.apache.hadoop.fs.Path constructor Path
public Path(URI aUri)
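Before the collected sources below, here is a minimal standalone sketch (not taken from any of the listed files) of constructing a Path directly from a URI; the HDFS address and file location are hypothetical placeholders:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical HDFS location; replace with a URI that exists in your cluster.
        URI uri = new URI("hdfs://namenode:8020/user/demo/input/data.txt");

        // Path(URI) preserves the scheme and authority carried by the URI.
        Path path = new Path(uri);

        // Resolve the FileSystem that owns this path and check for the file.
        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);
        System.out.println("exists: " + fs.exists(path));
    }
}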
From source file:WordCount_SiCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount_SiCombiner.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    // disable combiner
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(WordPartition.class);
    job.setNumReduceTasks(5);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:BooleanRetrievalCompressed.java
License:Apache License
private void initialize(String indexPath, String collectionPath, FileSystem fs) throws IOException {
    index = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), fs.getConf());
    collection = fs.open(new Path(collectionPath));
    stack = new Stack<Set<Integer>>();
}
From source file:HdfsReader.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("HdfsReader [FileSize i.e. 1g/10g/100g/200g]");
        return 1;
    }
    double fileSize;
    double fileSizeInMB;
    if (args[0].equals("1g")) {
        fileSize = 1073741824.0;
        fileSizeInMB = 1024.0;
    } else if (args[0].equals("10g")) {
        fileSize = 10737418240.0;
        fileSizeInMB = 10240.0;
    } else if (args[0].equals("100g")) {
        fileSize = 107374182400.0;
        fileSizeInMB = 102400.0;
    } else if (args[0].equals("200g")) {
        fileSize = 214748364800.0;
        fileSizeInMB = 204800.0;
    } else {
        throw new IllegalArgumentException("Invalid arg: " + args[0]);
    }

    String fileName = "read-" + args[0] + "-avg.txt";
    File avgFile = new File(fileName);
    PrintWriter avgPW = new PrintWriter(avgFile);
    fileName = "read-" + args[0] + "-min.txt";
    File minFile = new File(fileName);
    PrintWriter minPW = new PrintWriter(minFile);
    fileName = "read-" + args[0] + "-max.txt";
    File maxFile = new File(fileName);
    PrintWriter maxPW = new PrintWriter(maxFile);

    int numIters = 10;
    int bufferSize = 4096;
    long blockSize[] = new long[] { 67108864, 134217728, 268435456, 536870912, 1073741824 };
    short replication[] = new short[] { 1, 4 };
    String hdfsFile = "/hdfs_test/" + args[0] + "/1.in";

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);

    for (int i = 0; i < 5; i++) { // blockSize
        for (int j = 0; j < 2; j++) { // replication
            OutputStream os = fs.create(hdfsFilePath, true, bufferSize, replication[j], blockSize[i]);
            byte[] buf = new byte[bufferSize];
            for (int m = 0; m < bufferSize; m += 4) {
                buf[m] = (byte) m;
            }
            double numBufPerFile = fileSize / (double) bufferSize;
            for (double m = 0.0; m < numBufPerFile; m++) {
                os.write(buf);
            }
            os.close();

            long avg = 0, min = Long.MAX_VALUE, max = Long.MIN_VALUE;
            for (int k = 0; k < numIters; k++) {
                InputStream is = fs.open(hdfsFilePath);
                long startTime = System.currentTimeMillis();
                int bytesRead = is.read(buf);
                while (bytesRead != -1) {
                    bytesRead = is.read(buf);
                }
                is.close();
                long endTime = System.currentTimeMillis();
                long duration = (endTime - startTime);
                avg += duration;
                if (duration < min) {
                    min = duration;
                }
                if (duration > max) {
                    max = duration;
                }
            }

            // write result to output
            double avgBW = fileSizeInMB * 1000.0 * (double) numIters / (double) avg;
            avgPW.print(avgBW);
            avgPW.print("\t");
            double minBW = fileSizeInMB * 1000.0 / (double) max;
            minPW.print(minBW);
            minPW.print("\t");
            double maxBW = fileSizeInMB * 1000.0 / (double) min;
            maxPW.print(maxBW);
            maxPW.print("\t");
        }
        avgPW.println();
        minPW.println();
        maxPW.println();
    }
    avgPW.close();
    minPW.close();
    maxPW.close();
    return 0;
}
From source file:GetRetweetersAndCountPerUser.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: GetRetweetersAndCountPerUser <in> <out> <num_reducers>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(RetweetersPerUser.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    System.out.println(otherArgs[0]);
    job.setMapperClass(TweetMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    if (job.waitForCompletion(true)) {
        FileSystem hdfs = FileSystem.get(new URI(args[1]), conf);
        Path dir = new Path(args[1]);
        PathFilter filter = new PathFilter() {
            public boolean accept(Path file) {
                return file.getName().startsWith("part-r-");
            }
        };
        HashMap<Integer, Integer> counts_for_user = new HashMap<Integer, Integer>();
        FileStatus[] files = hdfs.listStatus(dir, filter);
        Arrays.sort(files);
        for (int i = 0; i != files.length; i++) {
            Path pt = files[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] columns = new String[2];
                columns = line.split("\t");
                int key = Integer.parseInt(columns[0]);
                if (counts_for_user.containsKey(key))
                    counts_for_user.put(key, counts_for_user.get(key) + 1);
                else
                    counts_for_user.put(key, 1);
            }
            br.close();
        }
        FSDataOutputStream fsDataOutputStream = hdfs.create(new Path(otherArgs[1] + "_count"));
        PrintWriter writer = new PrintWriter(fsDataOutputStream);
        for (Entry<Integer, Integer> e : counts_for_user.entrySet()) {
            writer.write(e.getKey() + "\t" + e.getValue() + "\n");
        }
        writer.close();
        fsDataOutputStream.close();
        hdfs.close();
        System.exit(0);
    }
    System.exit(1);
}
From source file:WordCount_PerMapTally.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount_PerMapTally.class);
    job.setMapperClass(TokenizerMapper.class);
    // disable combiner
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(WordPartitioner.class);
    job.setNumReduceTasks(5);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:SiCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(SiCombiner.class);
    job.setMapperClass(TokenizerMapper.class);
    // Aniket changes starts
    /* Here the partitioner is being called */
    job.setPartitionerClass(WordPartitioner.class);
    // Aniket changes ends
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:AnalyzeBigramRelativeFrequencyTuple.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("usage: [input-path]");
        System.exit(-1);
    }
    System.out.println("input path: " + args[0]);

    List<PairOfWritables<Tuple, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(args[0]));

    List<PairOfWritables<Tuple, FloatWritable>> list1 = Lists.newArrayList();
    List<PairOfWritables<Tuple, FloatWritable>> list2 = Lists.newArrayList();

    for (PairOfWritables<Tuple, FloatWritable> p : pairs) {
        Tuple bigram = p.getLeftElement();
        if (bigram.get(0).equals("light")) {
            list1.add(p);
        }
        if (bigram.get(0).equals("contain")) {
            list2.add(p);
        }
    }

    Collections.sort(list1, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {
        @SuppressWarnings("unchecked")
        public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });

    int i = 0;
    for (PairOfWritables<Tuple, FloatWritable> p : list1) {
        Tuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
        i++;
        if (i > 10) {
            break;
        }
    }

    Collections.sort(list2, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {
        @SuppressWarnings("unchecked")
        public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });

    i = 0;
    for (PairOfWritables<Tuple, FloatWritable> p : list2) {
        Tuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
        i++;
        if (i > 10) {
            break;
        }
    }
}
From source file:PerMapTally.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(PerMapTally.class);
    job.setMapperClass(TokenizerMapper.class);
    // Aniket changes starts
    /* Here the partitioner is being called */
    job.setPartitionerClass(WordPartitioner.class);
    // Aniket changes ends
    // Part 3 Aniket changes starts
    /* Here I am just disabling the combiner */
    // job.setCombinerClass(IntSumReducer.class);
    // Part 3 Aniket changes ends
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:ImportTsv.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
        throws IOException, ClassNotFoundException {
    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
        try (Admin admin = connection.getAdmin()) {
            // Support non-XML supported characters
            // by re-encoding the passed separator as a Base64 string.
            String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
            if (actualSeparator != null) {
                conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
            }

            // See if a non-default Mapper was set
            String mapperClassName = conf.get(MAPPER_CONF_KEY);
            Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

            TableName tableName = TableName.valueOf(args[0]);
            Path inputDir = new Path(args[1]);

            // set filter
            conf.set(EASTCOM_FILTER_PARAMS, args[3]);
            conf.set(EASTCOM_FILTER_DEFINE, args[4]);

            String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
            job = Job.getInstance(conf, jobName);
            job.setJarByClass(mapperClass);
            FileInputFormat.setInputPaths(job, inputDir);
            job.setInputFormatClass(TextInputFormat.class);
            job.setMapperClass(mapperClass);
            String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
            String columns[] = conf.getStrings(COLUMNS_CONF_KEY);
            if (StringUtils.isNotEmpty(conf.get(CREDENTIALS_LOCATION))) {
                String fileLoc = conf.get(CREDENTIALS_LOCATION);
                Credentials cred = Credentials.readTokenStorageFile(new File(fileLoc), conf);
                job.getCredentials().addAll(cred);
            }

            if (hfileOutPath != null) {
                if (!admin.tableExists(tableName)) {
                    String errorMsg = format("Table '%s' does not exist.", tableName);
                    if ("yes".equalsIgnoreCase(conf.get(CREATE_TABLE_CONF_KEY, "yes"))) {
                        LOG.warn(errorMsg);
                        // TODO: this is backwards. Instead of depending on the existence of a table,
                        // create a sane splits file for HFileOutputFormat based on data sampling.
                        createTable(admin, tableName, columns);
                    } else {
                        LOG.error(errorMsg);
                        throw new TableNotFoundException(errorMsg);
                    }
                }
                try (HTable table = (HTable) connection.getTable(tableName)) {
                    boolean noStrict = conf.getBoolean(NO_STRICT_COL_FAMILY, false);
                    // if no.strict is false then check column family
                    if (!noStrict) {
                        ArrayList<String> unmatchedFamilies = new ArrayList<String>();
                        Set<String> cfSet = getColumnFamilies(columns);
                        HTableDescriptor tDesc = table.getTableDescriptor();
                        for (String cf : cfSet) {
                            if (tDesc.getFamily(Bytes.toBytes(cf)) == null) {
                                unmatchedFamilies.add(cf);
                            }
                        }
                        if (unmatchedFamilies.size() > 0) {
                            ArrayList<String> familyNames = new ArrayList<String>();
                            for (HColumnDescriptor family : table.getTableDescriptor().getFamilies()) {
                                familyNames.add(family.getNameAsString());
                            }
                            String msg = "Column Families " + unmatchedFamilies + " specified in "
                                    + COLUMNS_CONF_KEY + " does not match with any of the table " + tableName
                                    + " column families " + familyNames + ".\n"
                                    + "To disable column family check, use -D" + NO_STRICT_COL_FAMILY + "=true.\n";
                            usage(msg);
                            System.exit(-1);
                        }
                    }
                    job.setReducerClass(PutSortReducer.class);
                    Path outputDir = new Path(hfileOutPath);
                    FileOutputFormat.setOutputPath(job, outputDir);
                    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
                    if (mapperClass.equals(TsvImporterTextMapper.class)) {
                        job.setMapOutputValueClass(Text.class);
                        job.setReducerClass(TextSortReducer.class);
                    } else {
                        job.setMapOutputValueClass(Put.class);
                        job.setCombinerClass(PutCombiner.class);
                    }
                    HFileOutputFormat2.configureIncrementalLoad(job, table, table);
                }
            } else {
                if (!admin.tableExists(tableName)) {
                    String errorMsg = format("Table '%s' does not exist.", tableName);
                    LOG.error(errorMsg);
                    throw new TableNotFoundException(errorMsg);
                }
                if (mapperClass.equals(TsvImporterTextMapper.class)) {
                    usage(TsvImporterTextMapper.class.toString()
                            + " should not be used for non bulkloading case. use "
                            + TsvImporterMapper.class.toString()
                            + " or custom mapper whose value type is Put.");
                    System.exit(-1);
                }
                // No reducers. Just write straight to table. Call initTableReducerJob
                // to set up the TableOutputFormat.
                TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
                job.setNumReduceTasks(0);
            }

            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                    com.google.common.base.Function.class /* Guava used by TsvParser */);
        }
    }
    return job;
}
From source file:Relevance.java
License:Apache License
/**
 * Exact relevance is slower, non-exact relevance will have false positives.
 */
protected void batch_query(Tap source, Tap output, Fields wantedFields, RelevanceFunction func, Tap keysTap,
        String keyField, boolean useBloom, int bloom_bits, int bloom_hashes, boolean exact) throws IOException {
    if (!useBloom && !exact)
        throw new IllegalArgumentException("Must either use bloom filter or be exact, or both!");

    FileSystem fs = FileSystem.get(new Configuration());
    Pipe finalPipe = new Pipe("data");
    finalPipe = new Each(finalPipe, wantedFields, new Identity());

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put("data", source);

    Map properties = new HashMap();
    String bloomFilterPath = "/tmp/" + UUID.randomUUID().toString() + ".bloomfilter";

    if (useBloom) {
        String jobId = UUID.randomUUID().toString();
        LOG.info("Creating bloom filter");
        writeOutBloomFilter(keysTap, keyField, fs, bloomFilterPath, bloom_bits, bloom_hashes);
        properties.put("mapred.job.reuse.jvm.num.tasks", -1);
        if (!TEST_MODE) {
            properties.put("mapred.cache.files", "hdfs://" + bloomFilterPath);
        } else {
            properties.put("batch_query.relevance.file", bloomFilterPath);
        }
        LOG.info("Done creating bloom filter");
        finalPipe = new Each(finalPipe, wantedFields, getRelevanceFilter(func, jobId));
    }

    if (exact) {
        sources.put("relevant", keysTap);
        Pipe relevantRecords = new Pipe("relevant");
        relevantRecords = new Each(relevantRecords, new Fields(keyField), new Identity());
        finalPipe = new Each(finalPipe, wantedFields, getExactFilter(func),
                Fields.join(wantedFields, new Fields(ID, RELEVANT_OBJECT)));
        finalPipe = new CoGroup(finalPipe, new Fields(RELEVANT_OBJECT), relevantRecords, new Fields(keyField),
                Fields.join(wantedFields, new Fields(ID, RELEVANT_OBJECT), new Fields("__ignored")));
        finalPipe = new Each(finalPipe, Fields.join(wantedFields, new Fields(ID)), new Identity());
        if (func.canHaveMultipleMatches()) {
            finalPipe = new Distinct(finalPipe, new Fields(ID));
        }
        finalPipe = new Each(finalPipe, wantedFields, new Identity());
    }

    Flow flow = new FlowConnector(properties).connect("Relevance: " + func.getClass().getSimpleName(), sources,
            output, finalPipe);
    flow.complete();

    if (useBloom)
        fs.delete(new Path(bloomFilterPath), false);
}