List of usage examples for org.apache.hadoop.fs FileSystem getConf
@Override
public Configuration getConf()
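FileSystem extends Configured, so getConf() simply returns the Configuration the file system instance was created with. As a quick orientation before the project examples below, here is a minimal, self-contained sketch (not taken from any of the listed sources) of the typical pattern: a FileSystem is obtained from a Configuration, and getConf() later hands that Configuration back so settings can be read without passing the original object around. The property name dfs.replication and its default of 3 are used purely for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class GetConfExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // getConf() returns the Configuration this FileSystem was initialized with,
        // so cluster-level settings can be read back from it later.
        int replication = fs.getConf().getInt("dfs.replication", 3);
        System.out.println("Configured replication: " + replication);
        fs.close();
    }
}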
From source file:org.pentaho.hadoop.shim.common.DistributedCacheUtilImpl.java
License:Apache License
/**
 * Stages the source file or folder to a Hadoop file system and sets its permission and replication value
 * appropriately to be used with the Distributed Cache. WARNING: This will delete the contents of dest before staging
 * the archive.
 *
 * @param source    File or folder to copy to the file system. If it is a folder all contents will be copied into
 *                  dest.
 * @param fs        Hadoop file system to store the contents of the archive in
 * @param dest      Destination to copy source into. If source is a file, the new file name will be exactly dest. If
 *                  source is a folder its contents will be copied into dest. For more info see {@link
 *                  FileSystem#copyFromLocalFile(org.apache.hadoop.fs.Path, org.apache.hadoop.fs.Path)}.
 * @param overwrite Should an existing file or folder be overwritten? If not, an exception will be thrown.
 * @param isPublic  If true, the staged file is marked with the public cached-file permission; otherwise the
 *                  default cached-file permission is used.
 * @throws IOException         Destination exists and is not a directory
 * @throws KettleFileException Source does not exist, or destination exists and overwrite is false.
 */
public void stageForCache(FileObject source, FileSystem fs, Path dest, boolean overwrite, boolean isPublic)
        throws IOException, KettleFileException {
    if (!source.exists()) {
        throw new KettleFileException(BaseMessages.getString(DistributedCacheUtilImpl.class,
                "DistributedCacheUtil.SourceDoesNotExist", source));
    }

    if (fs.exists(dest)) {
        if (overwrite) {
            // It is a directory, clear it out
            fs.delete(dest, true);
        } else {
            throw new KettleFileException(BaseMessages.getString(DistributedCacheUtilImpl.class,
                    "DistributedCacheUtil.DestinationExists", dest.toUri().getPath()));
        }
    }

    // Use the same replication we'd use for submitting jobs
    short replication = (short) fs.getConf().getInt("mapred.submit.replication", 10);

    if (source.getURL().toString().endsWith(CONFIG_PROPERTIES)) {
        copyConfigProperties(source, fs, dest);
    } else {
        Path local = new Path(source.getURL().getPath());
        fs.copyFromLocalFile(local, dest);
    }

    if (isPublic) {
        fs.setPermission(dest, PUBLIC_CACHED_FILE_PERMISSION);
    } else {
        fs.setPermission(dest, CACHED_FILE_PERMISSION);
    }
    fs.setReplication(dest, replication);
}
From source file:org.pentaho.hadoop.shim.hsp101.HadoopShim.java
License:Apache License
@Override
public void onLoad(HadoopConfiguration config, HadoopConfigurationFileSystemManager fsm) throws Exception {
    fsm.addProvider(config, "hdfs", config.getIdentifier(), new HDFSFileProvider());
    setDistributedCacheUtil(new DistributedCacheUtilImpl(config) {
        /**
         * Default permission for cached files
         * <p/>
         * Not using FsPermission.createImmutable due to EOFExceptions when using it with Hadoop 0.20.2
         */
        private final FsPermission CACHED_FILE_PERMISSION = new FsPermission((short) 0755);

        public void addFileToClassPath(Path file, Configuration conf) throws IOException {
            String classpath = conf.get("mapred.job.classpath.files");
            conf.set("mapred.job.classpath.files", classpath == null ? file.toString()
                    : classpath + getClusterPathSeparator() + file.toString());
            FileSystem fs = FileSystem.get(conf);
            URI uri = fs.makeQualified(file).toUri();
            DistributedCache.addCacheFile(uri, conf);
        }

        /**
         * Stages the source file or folder to a Hadoop file system and sets its permission and replication
         * value appropriately to be used with the Distributed Cache. WARNING: This will delete the contents of
         * dest before staging the archive.
         *
         * @param source    File or folder to copy to the file system. If it is a folder all contents will be
         *                  copied into dest.
         * @param fs        Hadoop file system to store the contents of the archive in
         * @param dest      Destination to copy source into. If source is a file, the new file name will be
         *                  exactly dest. If source is a folder its contents will be copied into dest. For more
         *                  info see {@link FileSystem#copyFromLocalFile(org.apache.hadoop.fs.Path,
         *                  org.apache.hadoop.fs.Path)}.
         * @param overwrite Should an existing file or folder be overwritten? If not, an exception will be
         *                  thrown.
         * @throws IOException         Destination exists and is not a directory
         * @throws KettleFileException Source does not exist, or destination exists and overwrite is false.
         */
        public void stageForCache(FileObject source, FileSystem fs, Path dest, boolean overwrite)
                throws IOException, KettleFileException {
            if (!source.exists()) {
                throw new KettleFileException(BaseMessages.getString(DistributedCacheUtilImpl.class,
                        "DistributedCacheUtil.SourceDoesNotExist", source));
            }

            if (fs.exists(dest)) {
                if (overwrite) {
                    // It is a directory, clear it out
                    fs.delete(dest, true);
                } else {
                    throw new KettleFileException(BaseMessages.getString(DistributedCacheUtilImpl.class,
                            "DistributedCacheUtil.DestinationExists", dest.toUri().getPath()));
                }
            }

            // Use the same replication we'd use for submitting jobs
            short replication = (short) fs.getConf().getInt("mapred.submit.replication", 10);
            copyFile(source, fs, dest, overwrite);
            fs.setReplication(dest, replication);
        }

        private void copyFile(FileObject source, FileSystem fs, Path dest, boolean overwrite) throws IOException {
            if (source.getType() == FileType.FOLDER) {
                fs.mkdirs(dest);
                fs.setPermission(dest, CACHED_FILE_PERMISSION);
                for (FileObject fileObject : source.getChildren()) {
                    copyFile(fileObject, fs, new Path(dest, fileObject.getName().getBaseName()), overwrite);
                }
            } else {
                try (FSDataOutputStream fsDataOutputStream = fs.create(dest, overwrite)) {
                    IOUtils.copy(source.getContent().getInputStream(), fsDataOutputStream);
                    fs.setPermission(dest, CACHED_FILE_PERMISSION);
                }
            }
        }

        public String getClusterPathSeparator() {
            return System.getProperty("hadoop.cluster.path.separator", ",");
        }
    });
}
From source file:org.springframework.data.hadoop.fs.FsShell.java
License:Apache License
private void copyToLocal(final FileSystem srcFS, final Path src, final File dst, final boolean copyCrc)
        throws IOException {
    final String COPYTOLOCAL_PREFIX = "_copyToLocal_";

    /* Keep the structure similar to ChecksumFileSystem.copyToLocal().
     * Ideally these two should just invoke FileUtil.copy() and not repeat
     * recursion here. Of course, copy() should support two more options:
     * copyCrc and useTmpFile (maybe useTmpFile need not be an option).
     */
    if (!srcFS.getFileStatus(src).isDir()) {
        if (dst.exists()) {
            // match the error message in FileUtil.checkDest():
            throw new IOException("Target " + dst + " already exists");
        }
        // use absolute name so that tmp file is always created under dest dir
        File tmp = FileUtil.createLocalTempFile(dst.getAbsoluteFile(), COPYTOLOCAL_PREFIX, true);
        if (!FileUtil.copy(srcFS, src, tmp, false, srcFS.getConf())) {
            throw new IOException("Failed to copy " + src + " to " + dst);
        }

        if (!tmp.renameTo(dst)) {
            throw new IOException("Failed to rename tmp file " + tmp + " to local destination \"" + dst + "\".");
        }

        if (copyCrc) {
            if (!(srcFS instanceof ChecksumFileSystem)) {
                throw new IOException("Source file system does not have crc files");
            }

            ChecksumFileSystem csfs = (ChecksumFileSystem) srcFS;
            File dstcs = FileSystem.getLocal(srcFS.getConf())
                    .pathToFile(csfs.getChecksumFile(new Path(dst.getCanonicalPath())));
            copyToLocal(csfs.getRawFileSystem(), csfs.getChecksumFile(src), dstcs, false);
        }
    } else {
        // once FileUtil.copy() supports tmp file, we don't need to mkdirs().
        dst.mkdirs();
        for (FileStatus path : srcFS.listStatus(src)) {
            copyToLocal(srcFS, path.getPath(), new File(dst, path.getPath().getName()), copyCrc);
        }
    }
}
From source file:org.springframework.data.hadoop.fs.HdfsResourceLoader.java
License:Apache License
/**
 * Constructs a new <code>HdfsResourceLoader</code> instance.
 *
 * @param fs Hadoop file system to use.
 */
public HdfsResourceLoader(FileSystem fs) {
    Assert.notNull(fs, "a non-null file-system required");

    this.fs = fs;
    internalFS = false;
    codecsFactory = new CompressionCodecFactory(fs.getConf());
}
From source file:org.talend.components.simplefileio.runtime.sinks.ParquetHdfsFileSink.java
License:Open Source License
@Override
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    try {
        FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
        List<Path> sourceFiles = new ArrayList<>();
        for (FileStatus sourceStatus : sourceStatuses) {
            sourceFiles.add(sourceStatus.getPath());
        }
        FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf())
                .getFileMetaData();
        ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(),
                new Path(targetFile), ParquetFileWriter.Mode.CREATE);
        writer.start();
        for (Path input : sourceFiles) {
            writer.appendFile(fs.getConf(), input);
        }
        writer.end(mergedMeta.getKeyValueMetaData());
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
    return true;
}
From source file:org.talend.components.simplefileio.runtime.sinks.UgiFileSinkBase.java
License:Open Source License
protected boolean mergeOutput(FileSystem fs, String sourceFolder, String targetFile) {
    // How files are merged differs between formats; subclasses override this method as needed.
    try {
        return FileUtil.copyMerge(fs, new Path(sourceFolder), fs, new Path(targetFile), false, fs.getConf(), "");
    } catch (Exception e) {
        LOG.error("Error when merging files in {}.\n{}", sourceFolder, e.getMessage());
        return false;
    }
}
From source file:org.talend.components.test.MiniDfsResource.java
License:Open Source License
/**
 * Tests that a file on the HDFS cluster contains the given Parquet records.
 *
 * @param path     the name of the file on the HDFS cluster
 * @param expected the expected Avro records in the file
 */
public static void assertReadParquetFile(FileSystem fs, String path, Set<IndexedRecord> expected, boolean part)
        throws IOException {
    Path p = new Path(path);
    if (fs.isFile(p)) {
        try (AvroParquetReader<GenericRecord> reader = new AvroParquetReader<GenericRecord>(fs.getConf(),
                new Path(path))) {
            IndexedRecord record = null;
            while (null != (record = reader.read())) {
                IndexedRecord eqRecord = null;
                for (IndexedRecord indexedRecord : expected) {
                    if (indexedRecord.equals(record)) {
                        eqRecord = indexedRecord;
                        break;
                    }
                }
                expected.remove(eqRecord);
            }
        }
        // Check before asserting for the message.
        if (!part && expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else if (fs.isDirectory(p)) {
        for (FileStatus fstatus : FileSystemUtil.listSubFiles(fs, p)) {
            assertReadParquetFile(fs, fstatus.getPath().toString(), expected, true);
        }
        // Check before asserting for the message.
        if (expected.size() != 0)
            assertThat("Not all avro records found: " + expected.iterator().next(), expected, hasSize(0));
    } else {
        fail("No such path: " + path);
    }
}
From source file:sa.edu.kaust.fwindex.IntDocVectorsForwardIndex.java
License:Apache License
/**
 * Creates an <code>IntDocVectorsForwardIndex</code> object.
 *
 * @param origIndexPath location of the original index file
 * @param fwindexPath   location of the forward index file
 * @param fs            handle to the FileSystem
 * @throws IOException
 */
public IntDocVectorsForwardIndex(String origIndexPath, String fwindexPath, FileSystem fs) throws IOException {
    mFs = fs;
    mConf = fs.getConf();
    mOrigIndexPath = origIndexPath;
    sLogger.debug("mPath: " + mOrigIndexPath);

    String forwardIndexPath = fwindexPath;
    sLogger.debug("forwardIndexPath: " + forwardIndexPath);
    FSDataInputStream posInput = fs.open(new Path(forwardIndexPath));

    mCount = 0;
    mPositions = new Hashtable<String, Long>();
    while (true) {
        // Terms are retrieved until the end of the file is reached.
        try {
            // The two values (term and position) are written in a single string (see
            // BuildIntDocVectorsForwardIndex.MyReducer.reduce). Here they are retrieved.
            String[] inp = posInput.readUTF().split("\t");
            String k = inp[0];
            long l = Long.parseLong(inp[1]);
            mPositions.put(k, l);
        } catch (Exception e) {
            break;
        }
        mCount++;
    }
    sLogger.info("mCount: " + mCount);
}
From source file:sa.edu.kaust.twitter.index.PostingsForwardIndex.java
License:Apache License
/**
 * Creates a <code>PostingsForwardIndex</code> object.
 *
 * @param origIndexPath location of the original index file
 * @param fwindexPath   location of the forward index file
 * @param fs            handle to the FileSystem
 * @throws IOException
 */
public PostingsForwardIndex(String origIndexPath, String fwindexPath, FileSystem fs) throws IOException {
    mFs = fs;
    mConf = fs.getConf();
    mOrigIndexPath = origIndexPath;
    sLogger.debug("mPath: " + mOrigIndexPath);

    String forwardIndexPath = fwindexPath;
    sLogger.debug("forwardIndexPath: " + forwardIndexPath);
    FSDataInputStream posInput = fs.open(new Path(forwardIndexPath));
    //mCount = posInput.readInt();
    //mPositions = new long[mCount];

    map = new HMapKL<String>();
    String term;
    long pos;
    int i = 0;
    System.out.println("Loading postings forward index ...");
    while (true) {
        try {
            term = posInput.readUTF();
            pos = posInput.readLong();
            map.put(term, pos);
        } catch (IOException e) {
            break;
        }
        i++;
        if (i % 1000000 == 0)
            System.out.println("loaded " + i + " entries ...");
    }
    System.out.println("done.");
}
From source file:sa.edu.kaust.twitter.index.TweetsForwardIndex.java
License:Apache License
/**
 * Creates a <code>TweetsForwardIndex</code> object.
 *
 * @param origIndexPath location of the original index file
 * @param fwindexPath   location of the forward index file
 * @param fs            handle to the FileSystem
 * @throws IOException
 */
public TweetsForwardIndex(String origIndexPath, String fwindexPath, FileSystem fs) throws IOException {
    mFs = fs;
    mConf = fs.getConf();
    mOrigIndexPath = origIndexPath;
    sLogger.debug("mPath: " + mOrigIndexPath);

    String forwardIndexPath = fwindexPath;
    sLogger.debug("forwardIndexPath: " + forwardIndexPath);
    FSDataInputStream posInput = fs.open(new Path(forwardIndexPath));
    //mCount = posInput.readInt();
    //mPositions = new long[mCount];

    map = new HMapKL<Long>();
    long tweetID;
    long pos;
    int i = 0;
    System.out.println("Loading tweets forward index ...");
    while (true) {
        try {
            tweetID = posInput.readLong();
            pos = posInput.readLong();
            map.put(tweetID, pos);
        } catch (IOException e) {
            break;
        }
        i++;
        if (i % 1000000 == 0) {
            //if (i % 10 == 0) {
            //System.out.println(tweetID + "\t" + pos);
            System.out.println("loaded " + i + " entries ...");
        }
    }
    System.out.println("done (" + i + " entries).");
}