List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
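Path.toUri() converts a Hadoop Path into a java.net.URI, preserving the scheme and authority (e.g. hdfs://host:port) when the path is fully qualified. Before the examples below, here is a minimal sketch of the round trip; the host name, port, and paths are placeholders:

import java.net.URI;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // A fully qualified path keeps its scheme and authority in the URI
        Path qualified = new Path("hdfs://namenode:8020/user/data/input.txt");
        URI uri = qualified.toUri();
        System.out.println(uri.getScheme());    // hdfs
        System.out.println(uri.getAuthority()); // namenode:8020
        System.out.println(uri.getPath());      // /user/data/input.txt

        // A scheme-less path yields a URI with no scheme or authority
        Path bare = new Path("/user/data/input.txt");
        System.out.println(bare.toUri().getScheme()); // null
    }
}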
From source file:com.conversantmedia.mapreduce.tool.DistributedResourceManager.java
License:Apache License
private static Object getResourceValue(Field field, String valueString, String originalTypeClassname,
        Path[] distFiles) throws IOException, ClassNotFoundException {
    // First, determine our approach:
    Object value = null;
    if (field.getType().isAssignableFrom(String.class)) {
        value = valueString;
    } else if (ClassUtils.isPrimitiveOrWrapper(field.getType())) {
        value = ConvertUtils.convert(valueString, field.getType());
    } else {
        Path path = distributedFilePath(valueString, distFiles);
        // This is something on the distributed cache (or illegal)
        if (field.getType() == Path.class) {
            value = path;
        } else if (field.getType() == File.class) {
            value = new File(path.toUri());
        }
        // Deserialize .ser file
        else if (field.getType().isAssignableFrom(Class.forName(originalTypeClassname))) {
            ObjectInputStream in = null;
            try {
                File beanSerFile = new File(path.toUri().getPath());
                FileInputStream fileIn = new FileInputStream(beanSerFile);
                in = new ObjectInputStream(fileIn);
                value = in.readObject();
            } finally {
                IOUtils.closeQuietly(in);
            }
        } else {
            throw new IllegalArgumentException("Cannot locate resource for field [" + field.getName() + "]");
        }
    }
    return value;
}
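Note the two conversions above: new File(path.toUri()) only works when the URI is absolute with a "file" scheme, while path.toUri().getPath() extracts the raw path string and sidesteps the scheme check. A minimal sketch of the difference, using placeholder paths:

import java.io.File;
import org.apache.hadoop.fs.Path;

public class FileFromPathExample {
    public static void main(String[] args) {
        // new File(URI) requires an absolute, hierarchical URI with scheme "file"
        Path withScheme = new Path("file:///tmp/resource.ser");
        File ok = new File(withScheme.toUri());

        // A scheme-less Path would make new File(URI) throw
        // IllegalArgumentException, so the raw path string is used instead
        Path noScheme = new Path("/tmp/resource.ser");
        File alsoOk = new File(noScheme.toUri().getPath());

        System.out.println(ok + " " + alsoOk);
    }
}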
From source file:com.dasasian.chok.command.ListIndicesCommand.java
License:Apache License
private long calculateIndexDiskUsage(String index) {
    Path indexPath = new Path(index);
    URI indexUri = indexPath.toUri();
    try {
        FileSystem fileSystem = FileSystem.get(indexUri, new Configuration());
        if (!fileSystem.exists(indexPath)) {
            return -1;
        }
        return fileSystem.getContentSummary(indexPath).getLength();
    } catch (Exception e) {
        return -1;
    }
}
From source file:com.dasasian.chok.util.HadoopUtil.java
License:Apache License
public static FileSystem getFileSystem(Path path) throws IOException {
    synchronized (FileSystem.class) { // had once a ConcurrentModificationException
        return FileSystem.get(path.toUri(), new Configuration());
    }
}
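The synchronization here reflects a design choice rather than an API requirement: FileSystem.get serves instances from a JVM-wide cache keyed by the URI's scheme and authority (plus the calling user), and the inline comment records a ConcurrentModificationException once observed in that cache under concurrent access. Callers sharing the returned FileSystem should also avoid closing it, since other threads may hold the same cached instance.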
From source file:com.datasalt.pangool.examples.avro.TestAvroTopicalWordCount.java
License:Apache License
public static int assertOutput(String output, Configuration conf)
        throws NumberFormatException, IOException, InterruptedException {
    int validatedOutputLines = 0;
    Path outPath = new Path(output);
    TupleFile.Reader reader = new TupleFile.Reader(FileSystem.get(outPath.toUri(), conf), conf, outPath);
    Tuple tuple = new Tuple(reader.getSchema());

    while (reader.next(tuple)) {
        Record record = (Record) tuple.get("my_avro");
        int topicId = (Integer) record.get("topic");
        String word = (record.get("word")).toString();
        int count = (Integer) record.get("count");
        if (topicId == 1) {
            if (word.equals("bar") || word.equals("foo")) {
                assertEquals(2, count);
                validatedOutputLines++;
            } else if (word.equals("blah") || word.equals("bloh")) {
                assertEquals(1, count);
                validatedOutputLines++;
            }
        } else if (topicId == 2) {
            if (word.equals("bar")) {
                assertEquals(2, count);
                validatedOutputLines++;
            } else if (word.equals("bor")) {
                assertEquals(1, count);
                validatedOutputLines++;
            }
        }
    }
    reader.close(); // release the underlying stream (missing in the original source)
    return validatedOutputLines;
}
From source file:com.datasalt.pangool.examples.topicalwordcount.TestTopicalWordCount.java
License:Apache License
public static int assertOutput(String output, Configuration conf)
        throws NumberFormatException, IOException, InterruptedException {
    int validatedOutputLines = 0;
    Path outPath = new Path(output);
    TupleFile.Reader reader = new TupleFile.Reader(FileSystem.get(outPath.toUri(), conf), conf, outPath);
    Tuple tuple = new Tuple(reader.getSchema());

    while (reader.next(tuple)) {
        int topicId = (Integer) tuple.get("topic");
        String word = ((Utf8) tuple.get("word")).toString();
        int count = (Integer) tuple.get("count");
        if (topicId == 1) {
            if (word.equals("bar") || word.equals("foo")) {
                assertEquals(2, count);
                validatedOutputLines++;
            } else if (word.equals("blah") || word.equals("bloh")) {
                assertEquals(1, count);
                validatedOutputLines++;
            }
        } else if (topicId == 2) {
            if (word.equals("bar")) {
                assertEquals(2, count);
                validatedOutputLines++;
            } else if (word.equals("bor")) {
                assertEquals(1, count);
                validatedOutputLines++;
            }
        }
    }
    reader.close();
    return validatedOutputLines;
}
From source file:com.datasalt.pangool.examples.topicalwordcount.TestTopicFingerprint.java
License:Apache License
@Test
public void test() throws Exception {
    trash(OUTPUT);

    Configuration conf = new Configuration();
    createInput(INPUT, conf);
    ToolRunner.run(getConf(), new TopicFingerprint(), new String[] { INPUT, OUTPUT, 2 + "" });

    Path outPath = new Path(OUTPUT + "/part-r-00000");
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    TupleFile.Reader reader = new TupleFile.Reader(fs, conf, outPath);
    Tuple tuple = new Tuple(reader.getSchema());

    // The order in the output file is deterministic (we have sorted by topic, count)
    reader.next(tuple);
    assertEquals(1, tuple.get("topic"));
    assertEquals("a", tuple.get("word").toString());

    reader.next(tuple);
    assertEquals(1, tuple.get("topic"));
    assertEquals("c", tuple.get("word").toString());

    reader.next(tuple);
    assertEquals(2, tuple.get("topic"));
    assertEquals("a", tuple.get("word").toString());

    reader.next(tuple);
    assertEquals(2, tuple.get("topic"));
    assertEquals("b", tuple.get("word").toString());

    reader.close();

    // Check the named output
    outPath = new Path(OUTPUT + "/" + TopicFingerprint.OUTPUT_TOTALCOUNT + "/" + "part-r-00000");
    reader = new TupleFile.Reader(fs, conf, outPath);
    tuple = new Tuple(reader.getSchema());

    reader.next(tuple);
    assertEquals(1, tuple.get("topic"));
    assertEquals(15, tuple.get("totalcount"));

    reader.next(tuple);
    assertEquals(2, tuple.get("topic"));
    assertEquals(19, tuple.get("totalcount"));

    reader.close();
    trash(INPUT, OUTPUT);
}
From source file:com.datasalt.pangool.examples.topicalwordcount.TestTopicFingerprint.java
License:Apache License
public void createInput(String input, Configuration conf) throws IOException, InterruptedException {
    Path inPath = new Path(input);
    FileSystem fs = FileSystem.get(inPath.toUri(), conf);
    TupleFile.Writer writer = new TupleFile.Writer(fs, conf, inPath, TopicalWordCount.getSchema());

    // Topic 1, words: { a, 10 } { b, 1 } { c, 5 }
    // Top 2 words = a(10), c(5)
    ITuple tuple = new Tuple(TopicalWordCount.getSchema());
    tuple.set("word", "a");
    tuple.set("topic", 1);
    tuple.set("count", 10);
    writer.append(tuple);

    tuple.set("word", "b");
    tuple.set("topic", 1);
    tuple.set("count", 1);
    writer.append(tuple);

    tuple.set("word", "c");
    tuple.set("topic", 1);
    tuple.set("count", 5);
    writer.append(tuple);

    // Topic 2, words: { a, 10 } { b, 9 } { c, 5 }
    // Top 2 words = a(10), b(9)
    tuple.set("word", "a");
    tuple.set("topic", 2);
    tuple.set("count", 10);
    writer.append(tuple);

    tuple.set("word", "b");
    tuple.set("topic", 2);
    tuple.set("count", 9);
    writer.append(tuple);

    tuple.set("word", "c");
    tuple.set("topic", 2);
    tuple.set("count", 5);
    writer.append(tuple);

    writer.close();
}
From source file:com.datasalt.pangool.solr.SolrRecordWriter.java
License:Apache License
/**
 * Write a file to a zip output stream, removing leading path name components from the actual file name when
 * creating the zip file entry.
 *
 * The entry placed in the zip file is <code>baseName</code>/<code>relativePath</code>, where
 * <code>relativePath</code> is constructed by removing a leading <code>root</code> from the path for
 * <code>itemToZip</code>.
 *
 * If <code>itemToZip</code> is an empty directory, it is ignored. If <code>itemToZip</code> is a directory,
 * the contents of the directory are added recursively.
 *
 * @param zos
 *          The zip output stream
 * @param baseName
 *          The base name to use for the file name entry in the zip file
 * @param root
 *          The path to remove from <code>itemToZip</code> to make a relative path name
 * @param itemToZip
 *          The path to the file to be added to the zip file
 * @return the number of entries added
 * @throws IOException
 */
static public int zipDirectory(final Configuration conf, final ZipOutputStream zos, final String baseName,
        final String root, final Path itemToZip) throws IOException {
    LOG.info(String.format("zipDirectory: %s %s %s", baseName, root, itemToZip));
    LocalFileSystem localFs = FileSystem.getLocal(conf);
    int count = 0;

    final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
    if (itemStatus.isDir()) {
        final FileStatus[] statai = localFs.listStatus(itemToZip);

        // Add a directory entry to the zip file
        final String zipDirName = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
        final ZipEntry dirZipEntry = new ZipEntry(zipDirName + Path.SEPARATOR_CHAR);
        LOG.info(String.format("Adding directory %s to zip", zipDirName));
        zos.putNextEntry(dirZipEntry);
        zos.closeEntry();
        count++;

        if (statai == null || statai.length == 0) {
            LOG.info(String.format("Skipping empty directory %s", itemToZip));
            return count;
        }
        for (FileStatus status : statai) {
            count += zipDirectory(conf, zos, baseName, root, status.getPath());
        }
        LOG.info(String.format("Wrote %d entries for directory %s", count, itemToZip));
        return count;
    }

    final String inZipPath = relativePathForZipEntry(itemToZip.toUri().getPath(), baseName, root);
    if (inZipPath.length() == 0) {
        LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)", itemToZip, root, baseName));
        return 0;
    }

    // Take empty files in case the place holder is needed
    FSDataInputStream in = null;
    try {
        in = localFs.open(itemToZip);
        final ZipEntry ze = new ZipEntry(inZipPath);
        ze.setTime(itemStatus.getModificationTime());
        // Comments confuse looking at the zip file
        // ze.setComment(itemToZip.toString());
        zos.putNextEntry(ze);
        IOUtils.copyBytes(in, zos, conf, false);
        zos.closeEntry();
        LOG.info(String.format("Wrote %d entries for file %s", count, itemToZip));
        return 1;
    } finally {
        if (in != null) { // guard against an NPE when open() itself failed
            in.close();
        }
    }
}
From source file:com.datasalt.pangool.utils.DCUtils.java
License:Apache License
/**
 * Utility method for serializing an object and saving it in the Distributed Cache.
 * <p>
 * The file where it has been serialized will be saved into a Hadoop Configuration property so that you can
 * call {@link DCUtils#loadSerializedObjectInDC(Configuration, Class, String, boolean)} to re-instantiate the
 * serialized instance.
 *
 * @param obj The obj instance to serialize using Java serialization.
 * @param serializeToLocalFile The local file where the instance will be serialized. It will be copied to the HDFS and removed.
 * @param conf The Hadoop Configuration.
 * @throws FileNotFoundException
 * @throws IOException
 * @throws URISyntaxException
 */
public static void serializeToDC(Object obj, String serializeToLocalFile, Configuration conf)
        throws FileNotFoundException, IOException, URISyntaxException {
    File hadoopTmpDir = new File(conf.get("hadoop.tmp.dir"));
    if (!hadoopTmpDir.exists()) {
        hadoopTmpDir.mkdir();
    }
    File file = new File(hadoopTmpDir, serializeToLocalFile);
    FileSystem fS = FileSystem.get(conf);

    ObjectOutput out = new ObjectOutputStream(new FileOutputStream(file));
    out.writeObject(obj);
    out.close();

    if (fS.equals(FileSystem.getLocal(conf))) {
        return;
    }

    String tmpHdfsFolder = conf.get(HDFS_TMP_FOLDER_CONF);
    if (tmpHdfsFolder == null) {
        // Set the temporary folder for Pangool instances to the temporary folder of the user that is
        // running the Job. This folder will be used across the cluster for locating the instances.
        // This way, tasktrackers that are being run as a different user will still be able to locate it.
        tmpHdfsFolder = conf.get("hadoop.tmp.dir");
        conf.set(HDFS_TMP_FOLDER_CONF, tmpHdfsFolder);
    }

    Path toHdfs = new Path(tmpHdfsFolder, serializeToLocalFile);
    if (fS.exists(toHdfs)) { // delete any previous copy so the fresh instance replaces it
        fS.delete(toHdfs, false);
    }
    FileUtil.copy(FileSystem.getLocal(conf), new Path(file + ""), FileSystem.get(conf), toHdfs, true, conf);
    DistributedCache.addCacheFile(toHdfs.toUri(), conf);
}
From source file:com.datasalt.pangool.utils.InstancesDistributor.java
License:Apache License
/**
 * Utility method for serializing an object and saving it in a way that later can be recovered anywhere in
 * the cluster.
 * <p>
 * The file where it has been serialized will be saved into a Hadoop Configuration property so that you can
 * call {@link InstancesDistributor#loadInstance(Configuration, Class, String, boolean)} to re-instantiate
 * the serialized instance.
 *
 * @param obj The obj instance to serialize using Java serialization.
 * @param fileName The file name where the instance will be serialized.
 * @param conf The Hadoop Configuration.
 * @throws FileNotFoundException
 * @throws IOException
 * @throws URISyntaxException
 */
public static void distribute(Object obj, String fileName, Configuration conf)
        throws FileNotFoundException, IOException, URISyntaxException {
    FileSystem fS = FileSystem.get(conf);

    // Set the temporary folder for Pangool instances to the temporary folder of the user that is running
    // the Job. This folder will be used across the cluster for locating the instances.
    // The default value can be changed by a user-provided one.
    String tmpHdfsFolder = conf.get(HDFS_TMP_FOLDER_CONF, DEFAULT_HDFS_TMP_FOLDER_CONF_VALUE);

    Path toHdfs = new Path(tmpHdfsFolder, fileName);
    if (fS.exists(toHdfs)) { // delete any previous copy so the fresh instance replaces it
        fS.delete(toHdfs, false);
    }
    ObjectOutput out = new ObjectOutputStream(fS.create(toHdfs));
    out.writeObject(obj);
    out.close();

    DistributedCache.addCacheFile(toHdfs.toUri(), conf);
}