Usage examples for org.apache.hadoop.io.SequenceFile.createWriter
@Deprecated public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass, CompressionType compressionType, CompressionCodec codec) throws IOException
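For orientation, here is a minimal, self-contained sketch of calling this deprecated overload directly. The path, key/value types, and codec choice are illustrative assumptions, not taken from the examples below; the commented-out variant at the end shows the option-based replacement available in Hadoop 2.x and later.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class CreateWriterSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path file = new Path("/tmp/example.seq"); // illustrative path

    // Deprecated stream-based overload: the caller opens (and owns) the output stream.
    FSDataOutputStream out = fs.create(file);
    SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, IntWritable.class, Text.class,
        SequenceFile.CompressionType.BLOCK, new DefaultCodec());
    try {
      writer.append(new IntWritable(1), new Text("value"));
    } finally {
      writer.close();
      out.close(); // the stream was opened by the caller, so close it here as well
    }

    // Option-based replacement in Hadoop 2.x+:
    // SequenceFile.Writer w = SequenceFile.createWriter(conf,
    //     SequenceFile.Writer.file(file),
    //     SequenceFile.Writer.keyClass(IntWritable.class),
    //     SequenceFile.Writer.valueClass(Text.class),
    //     SequenceFile.Writer.compression(SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
  }
}

Note how the examples below that use stream-based overloads likewise close the writer and the underlying stream separately.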
From source file:org.commoncrawl.service.listcrawler.HDFSFlusherThread.java
License:Open Source License
@Override
public void run() {
  boolean shutdown = false;
  while (!shutdown) {
    try {
      final CacheFlushRequest request = _manager.getHDFSFlushRequestQueue().take();
      switch (request._requestType) {
      case ExitThreadRequest: {
        // shutdown condition ...
        CacheManager.LOG.info("Cache Flusher Thread Received Shutdown. Exiting!");
        shutdown = true;
      }
        break;
      case FlushRequest: {
        LOG.info("Received Flush Request");
        ArrayList<IndexDataFileTriple> tempFiles = new ArrayList<IndexDataFileTriple>();
        ArrayList<FingerprintAndOffsetTuple> tuplesOut = new ArrayList<FingerprintAndOffsetTuple>();
        // flag to track request status at end ..
        boolean requestFailed = false;
        long logStart = LocalLogFileHeader.SIZE;
        long logEnd = logStart + request._bytesToFlush;
        // create a hdfs temp file for data (and index)
        long generateTime = System.currentTimeMillis();
        Path tempDir = new Path(CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".")
            + "/flusher-temp-" + generateTime);
        // mkdir ...
        try {
          _manager.getRemoteFileSystem().mkdirs(tempDir);
        } catch (IOException e1) {
          LOG.error(CCStringUtils.stringifyException(e1));
          requestFailed = true;
        }
        int iterationNumber = 0;
        while (logStart != logEnd && !requestFailed) {
          Path tempDataFile = new Path(tempDir,
              CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + iterationNumber);
          Path tempIndexFile = new Path(tempDir,
              CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + iterationNumber);
          LOG.info("FlushRequest Pass#:" + iterationNumber + " DataPath:" + tempDataFile
              + " IndexPath:" + tempIndexFile);
          SequenceFile.Writer writer = null;
          FSDataOutputStream indexOutputStream = null;
          RandomAccessFile localLogFile = null;
          try {
            LOG.info("Pass#:" + iterationNumber + " Opening SequenceFile for Output");
            // open temporary hdfs streams ...
            writer = SequenceFile.createWriter(_manager.getRemoteFileSystem(),
                CrawlEnvironment.getHadoopConfig(), tempDataFile, Text.class, CacheItem.class,
                CompressionType.NONE);
            // opening index output stream ...
            LOG.info("Pass#:" + iterationNumber + " Opening Index Output Stream");
            indexOutputStream = _manager.getRemoteFileSystem().create(tempIndexFile);
            LOG.info("Pass#:" + iterationNumber + " Opening Local Log");
            localLogFile = new RandomAccessFile(_manager.getActiveLogFilePath(), "rw");
            // transfer log entries and generate index
            logStart = generateSequenceFileAndIndex(_manager.getCacheFlushThreshold(), localLogFile,
                logStart, logEnd, _manager.getLocalLogSyncBytes(), writer, indexOutputStream, tuplesOut);
          } catch (IOException e) {
            CacheManager.LOG.error(CCStringUtils.stringifyException(e));
            requestFailed = true;
          } finally {
            if (writer != null) {
              try {
                writer.close();
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
            if (indexOutputStream != null) {
              try {
                indexOutputStream.close();
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
            if (localLogFile != null) {
              try {
                localLogFile.close();
              } catch (IOException e) {
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
          }
          if (requestFailed) {
            try {
              LOG.info("Pass#:" + iterationNumber + " Failed. Deleting temp files");
              _manager.getRemoteFileSystem().delete(tempDataFile, false);
              _manager.getRemoteFileSystem().delete(tempIndexFile, false);
            } catch (IOException e) {
              LOG.error("Delete Failed During Failure! Potential Orphan Files! : "
                  + CCStringUtils.stringifyException(e));
            }
            break;
          } else {
            LOG.info("Pass#:" + iterationNumber + " Finished. Adding files to tuple list");
            // add temp file tuple
            IndexDataFileTriple indexDataPair = new IndexDataFileTriple();
            indexDataPair._dataFilePath = tempDataFile;
            indexDataPair._indexFilePath = tempIndexFile;
            tempFiles.add(indexDataPair);
          }
          iterationNumber++;
        }
        LOG.info("All Passes Complete. Finalizing Commit");
        // ok if request failed ...
        if (!requestFailed) {
          int itemIndex = 0;
          for (IndexDataFileTriple indexDataPair : tempFiles) {
            // generate final paths ...
            Path finalOutputDir = _manager.getRemoteDataDirectory();
            Path finalDataFilePath = new Path(finalOutputDir,
                CacheManager.PROXY_CACHE_FILE_DATA_PREFIX + "-" + (generateTime + itemIndex));
            Path finalIndexFilePath = new Path(finalOutputDir,
                CacheManager.PROXY_CACHE_FILE_INDEX_PREFIX + "-" + (generateTime + itemIndex));
            try {
              LOG.info("Pass#:" + itemIndex + " Renaming Temp Files");
              LOG.info("Pass#:" + itemIndex + " Final Data File Name is:" + finalDataFilePath);
              LOG.info("Pass#:" + itemIndex + " Final Index File Name is:" + finalIndexFilePath);
              // rename files ...
              _manager.getRemoteFileSystem().rename(indexDataPair._dataFilePath, finalDataFilePath);
              indexDataPair._dataFilePath = finalDataFilePath;
              _manager.getRemoteFileSystem().rename(indexDataPair._indexFilePath, finalIndexFilePath);
              indexDataPair._indexFilePath = finalIndexFilePath;
            } catch (IOException e) {
              LOG.info("Pass#:" + itemIndex + " Rename Failed");
              LOG.error(CCStringUtils.stringifyException(e));
              requestFailed = true;
              break;
            }
            try {
              // copy to local ...
              indexDataPair._localIndexFilePath = new File(_manager.getLocalDataDirectory(),
                  finalIndexFilePath.getName());
              LOG.info("Pass#:" + itemIndex + " Copying Remote Index File at:" + finalIndexFilePath
                  + " to Local Directory:" + indexDataPair._localIndexFilePath.getAbsolutePath());
              _manager.getRemoteFileSystem().copyToLocalFile(finalIndexFilePath,
                  new Path(indexDataPair._localIndexFilePath.getAbsolutePath()));
              LOG.info("Pass#:" + itemIndex + " Done Copying Remote Index File to Local");
            } catch (IOException e) {
              LOG.info("Pass#:" + itemIndex + " Local File Copy Failed with Exception:"
                  + CCStringUtils.stringifyException(e));
              requestFailed = true;
              indexDataPair._localIndexFilePath = null;
              break;
            }
            // increment item index
            itemIndex++;
          }
          // ok callback to manager if request succeeded
          if (!requestFailed) {
            try {
              LOG.info("Flush Complete. Calling hdfsFlushComplete");
              _manager.hdfsCacheFlushRequestComplete(request, tuplesOut, tempFiles);
              LOG.info("Flush Complete. hdfsFlushComplete succeeded");
            } catch (IOException e) {
              LOG.error("hdfsFlushComplete returned Exception:" + CCStringUtils.stringifyException(e));
              requestFailed = true;
            }
          }
        }
        if (requestFailed) {
          LOG.info("Cache Manager Log Flush Failed. Deleting files");
          try {
            // delete temp file directory recursively
            _manager.getRemoteFileSystem().delete(tempDir, true);
          } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
          }
          // iterate temp file list
          for (IndexDataFileTriple triple : tempFiles) {
            try {
              LOG.info("Deleting:" + triple._dataFilePath);
              _manager.getRemoteFileSystem().delete(triple._dataFilePath, false);
              LOG.info("Deleting:" + triple._indexFilePath);
              _manager.getRemoteFileSystem().delete(triple._indexFilePath, false);
              if (triple._localIndexFilePath != null) {
                LOG.info("Deleting LOCAL:" + triple._localIndexFilePath);
                triple._localIndexFilePath.delete();
              }
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
            }
          }
          // callback to manager with the bad news ...
          _manager.hdfsCacheFlushRequestFailed(request);
        }
      }
        break;
      }
    } catch (InterruptedException e) {
      LOG.error("Unexpected Exception in HDFSFlusher Thread:" + CCStringUtils.stringifyException(e));
    }
  }
}
From source file:org.commoncrawl.util.MapReduceJobStatsWriter.java
License:Open Source License
/**
 * Constructor
 *
 * @param keyClass key type
 * @param valueClass value type
 * @param familyKey
 * @param groupingKey
 * @param uniqueKey
 */
public MapReduceJobStatsWriter(FileSystem remoteFileSystem, Configuration config, Class<KeyType> keyClass,
    Class<ValueType> valueClass, String familyKey, String groupingKey, long uniqueKey) throws IOException {
  _logFamily = familyKey;
  _groupingKey = groupingKey;
  _uniqueKey = uniqueKey;
  _remoteFileSystem = remoteFileSystem;
  _config = config;
  // temp file
  _tempFileName = File.createTempFile("statsWriter", "seq");
  // create output stream that sequence file writer will output to
  _outputStream = FileSystem.getLocal(_config).create(new Path(_tempFileName.getAbsolutePath()));
  LzoCodec codec = new LzoCodec();
  // create sequencefile writer
  _writer = SequenceFile.createWriter(config, _outputStream, keyClass, valueClass, CompressionType.BLOCK,
      codec);
  // start event loop
  _eventLoop.start();
}
From source file:org.honu.datacollection.writer.localfs.LockFreeWriter.java
License:Apache License
protected void rotate() {
  Tracer t = Tracer.startNewTracer("honu.server." + group + ".rotateDataSink");
  isRunning = true;
  calendar.setTimeInMillis(System.currentTimeMillis());
  log.info(group + "- start Date [" + calendar.getTime() + "]");
  log.info(group + "- Rotate from " + Thread.currentThread().getName());

  String newName = day.format(calendar.getTime());
  newName += localHostAddr + new java.rmi.server.UID().toString();
  newName = newName.replace("-", "");
  newName = newName.replace(":", "");
  // newName = newName.replace(".", "");
  newName = localOutputDir + "/" + newName.trim();

  try {
    FSDataOutputStream previousOutputStr = currentOutputStr;
    Path previousPath = currentPath;
    String previousFileName = currentFileName;

    if (previousOutputStr != null) {
      seqFileWriter.close();
      previousOutputStr.close();
      if (chunksWrittenThisRotate) {
        fs.rename(previousPath, new Path(previousFileName + ".done"));
        fileQueue.add(previousFileName + ".done");
      } else {
        log.info(group + "- no chunks written to " + previousPath + ", deleting");
        fs.delete(previousPath, false);
      }
    }

    Path newOutputPath = new Path(newName + ".chukwa");
    FSDataOutputStream newOutputStr = fs.create(newOutputPath);
    currentOutputStr = newOutputStr;
    currentPath = newOutputPath;
    currentFileName = newName;
    chunksWrittenThisRotate = false;

    if (codec != null) {
      seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class,
          ChunkImpl.class, SequenceFile.CompressionType.BLOCK, codec);
    } else {
      seqFileWriter = SequenceFile.createWriter(conf, newOutputStr, ChukwaArchiveKey.class,
          ChunkImpl.class, SequenceFile.CompressionType.NONE, codec);
    }
  } catch (Throwable e) {
    if (t != null) {
      t.stopAndLogTracer();
    }
    log.fatal(group + "- Throwable Exception in rotate. Exiting!", e);
    // Shutting down the collector
    // Watchdog will re-start it automatically
    DaemonWatcher.bailout(-1);
  }

  // Check for disk space
  File directory4Space = new File(localOutputDir);
  long totalSpace = directory4Space.getTotalSpace();
  long freeSpace = directory4Space.getFreeSpace();
  long minFreeAvailable = (totalSpace * minPercentFreeDisk) / 100;

  if (log.isDebugEnabled()) {
    log.debug(group + "- Directory: " + localOutputDir + ", totalSpace: " + totalSpace + ", freeSpace: "
        + freeSpace + ", minFreeAvailable: " + minFreeAvailable + ", percentFreeDisk: "
        + minPercentFreeDisk);
  }

  if (freeSpace < minFreeAvailable) {
    log.fatal(group + "- No space left on device, Bail out!");
    DaemonWatcher.bailout(-1);
  }

  nextRotate = System.currentTimeMillis() + rotateInterval;
  if (t != null) {
    t.stopAndLogTracer();
  }
}
From source file:org.jd.copier.mapred.DistCp.java
License:Apache License
/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
  jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

  // set boolean values
  final boolean update = args.flags.contains(Options.UPDATE);
  final boolean skipCRCCheck = args.flags.contains(Options.SKIPCRC);
  final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
  jobConf.setBoolean(Options.UPDATE.propertyname, update);
  jobConf.setBoolean(Options.SKIPCRC.propertyname, skipCRCCheck);
  jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
  jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
      args.flags.contains(Options.IGNORE_READ_FAILURES));
  jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

  final String randomId = getRandomId();
  JobClient jClient = new JobClient(jobConf);
  Path stagingArea;
  try {
    stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
  Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
  FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
  FileSystem.mkdirs(jClient.getFs(), jobDirectory, mapredSysPerms);
  jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());
  long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

  FileSystem dstfs = args.dst.getFileSystem(conf);

  // get tokens for all the required FileSystems..
  TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

  boolean dstExists = dstfs.exists(args.dst);
  boolean dstIsDir = false;
  if (dstExists) {
    dstIsDir = dstfs.getFileStatus(args.dst).isDir();
  }

  // default logPath
  Path logPath = args.log;
  if (logPath == null) {
    String filename = "_distcp_logs_" + randomId;
    if (!dstExists || !dstIsDir) {
      Path parent = args.dst.getParent();
      if (null == parent) {
        // If dst is '/' on S3, it might not exist yet, but dst.getParent()
        // will return null. In this case, use '/' as its own parent to prevent
        // NPE errors below.
        parent = args.dst;
      }
      if (!dstfs.exists(parent)) {
        dstfs.mkdirs(parent);
      }
      logPath = new Path(parent, filename);
    } else {
      logPath = new Path(args.dst, filename);
    }
  }
  FileOutputFormat.setOutputPath(jobConf, logPath);

  // create src list, dst list
  FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

  Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
  jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
  SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist,
      LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE);

  Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
  SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
      Text.class, SequenceFile.CompressionType.NONE);

  Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
  jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
  SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
      FilePair.class, SequenceFile.CompressionType.NONE);

  // handle the case where the destination directory doesn't exist
  // and we've only a single src directory OR we're updating/overwriting
  // the contents of the destination directory.
  final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
  int srcCount = 0, cnsyncf = 0, dirsyn = 0;
  long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
  try {
    for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
      final Path src = srcItr.next();
      FileSystem srcfs = src.getFileSystem(conf);
      FileStatus srcfilestat = srcfs.getFileStatus(src);
      Path root = special && srcfilestat.isDir() ? src : src.getParent();
      if (srcfilestat.isDir()) {
        ++srcCount;
      }

      Stack<FileStatus> pathstack = new Stack<FileStatus>();
      for (pathstack.push(srcfilestat); !pathstack.empty();) {
        FileStatus cur = pathstack.pop();
        FileStatus[] children = srcfs.listStatus(cur.getPath());
        for (int i = 0; i < children.length; i++) {
          boolean skipfile = false;
          final FileStatus child = children[i];
          final String dst = makeRelative(root, child.getPath());
          ++srcCount;

          if (child.isDir()) {
            pathstack.push(child);
          } else {
            // skip file if the src and the dst files are the same.
            skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst), skipCRCCheck);
            // skip file if it exceed file limit or size limit
            skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

            if (!skipfile) {
              ++fileCount;
              byteCount += child.getLen();

              if (LOG.isTraceEnabled()) {
                LOG.trace("adding file " + child.getPath());
              }

              ++cnsyncf;
              cbsyncs += child.getLen();
              if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                src_writer.sync();
                dst_writer.sync();
                cnsyncf = 0;
                cbsyncs = 0L;
              }
            }
          }

          if (!skipfile) {
            src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                new FilePair(child, dst));
          }

          dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
        }

        if (cur.isDir()) {
          String dst = makeRelative(root, cur.getPath());
          dir_writer.append(new Text(dst), new FilePair(cur, dst));
          if (++dirsyn > SYNC_FILE_MAX) {
            dirsyn = 0;
            dir_writer.sync();
          }
        }
      }
    }
  } finally {
    checkAndClose(src_writer);
    checkAndClose(dst_writer);
    checkAndClose(dir_writer);
  }

  FileStatus dststatus = null;
  try {
    dststatus = dstfs.getFileStatus(args.dst);
  } catch (FileNotFoundException fnfe) {
    LOG.info(args.dst + " does not exist.");
  }

  // create dest path dir if copying > 1 file
  if (dststatus == null) {
    if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
      throw new IOException("Failed to create " + args.dst);
    }
  }

  final Path sorted = new Path(jobDirectory, "_distcp_sorted");
  checkDuplication(jobfs, dstfilelist, sorted, conf);

  if (dststatus != null && args.flags.contains(Options.DELETE)) {
    deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
  }

  Path tmpDir = new Path(
      (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
      "_distcp_tmp_" + randomId);
  jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

  // Explicitly create the tmpDir to ensure that it can be cleaned
  // up by fullyDelete() later.
  tmpDir.getFileSystem(conf).mkdirs(tmpDir);

  LOG.info("sourcePathsCount=" + srcCount);
  LOG.info("filesToCopyCount=" + fileCount);
  LOG.info("bytesToCopyCount=" + StringUtils.humanReadableInt(byteCount));
  jobConf.setInt(SRC_COUNT_LABEL, srcCount);
  jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
  setMapCount(byteCount, jobConf);
  return fileCount > 0;
}
From source file:org.jd.copier.mapred.DistCp.java
License:Apache License
/** Delete the dst files/dirs which do not exist in src */
static private void deleteNonexisting(FileSystem dstfs, FileStatus dstroot, Path dstsorted, FileSystem jobfs,
    Path jobdir, JobConf jobconf, Configuration conf) throws IOException {
  if (!dstroot.isDir()) {
    throw new IOException("dst must be a directory when option " + Options.DELETE.cmd
        + " is set, but dst (= " + dstroot.getPath() + ") is not a directory.");
  }

  // write dst lsr results
  final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
  final SequenceFile.Writer writer = SequenceFile.createWriter(jobfs, jobconf, dstlsr, Text.class,
      dstroot.getClass(), SequenceFile.CompressionType.NONE);
  try {
    // do lsr to get all file statuses in dstroot
    final Stack<FileStatus> lsrstack = new Stack<FileStatus>();
    for (lsrstack.push(dstroot); !lsrstack.isEmpty();) {
      final FileStatus status = lsrstack.pop();
      if (status.isDir()) {
        for (FileStatus child : dstfs.listStatus(status.getPath())) {
          String relative = makeRelative(dstroot.getPath(), child.getPath());
          writer.append(new Text(relative), child);
          lsrstack.push(child);
        }
      }
    }
  } finally {
    checkAndClose(writer);
  }

  // sort lsr results
  final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
  SequenceFile.Sorter sorter = new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class,
      FileStatus.class, jobconf);
  sorter.sort(dstlsr, sortedlsr);

  // compare lsr list and dst list
  SequenceFile.Reader lsrin = null;
  SequenceFile.Reader dstin = null;
  try {
    lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
    dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

    // compare sorted lsr list and sorted dst list
    final Text lsrpath = new Text();
    final FileStatus lsrstatus = new FileStatus();
    final Text dstpath = new Text();
    final Text dstfrom = new Text();
    final FsShell shell = new FsShell(conf);
    final String[] shellargs = { "-rmr", null };

    boolean hasnext = dstin.next(dstpath, dstfrom);
    for (; lsrin.next(lsrpath, lsrstatus);) {
      int dst_cmp_lsr = dstpath.compareTo(lsrpath);
      for (; hasnext && dst_cmp_lsr < 0;) {
        hasnext = dstin.next(dstpath, dstfrom);
        dst_cmp_lsr = dstpath.compareTo(lsrpath);
      }

      if (dst_cmp_lsr == 0) {
        // lsrpath exists in dst, skip it
        hasnext = dstin.next(dstpath, dstfrom);
      } else {
        // lsrpath does not exist, delete it
        String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
        if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
          shellargs[1] = s;
          int r = 0;
          try {
            r = shell.run(shellargs);
          } catch (Exception e) {
            throw new IOException("Exception from shell.", e);
          }
          if (r != 0) {
            throw new IOException(
                "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r);
          }
        }
      }
    }
  } finally {
    checkAndClose(lsrin);
    checkAndClose(dstin);
  }
}
From source file:org.mrgeo.vector.mrsvector.OSMTileIngester.java
License:Apache License
static Writer openWriter(final Configuration conf, final Path path, final String subdir, final Class<?> key,
    final Class<?> value) throws IOException {
  final FileSystem fs = HadoopFileUtils.getFileSystem(path);
  final String name = HadoopUtils.createRandomString(10);
  final Path stringsPath = new Path(path, subdir + "/" + name);
  return SequenceFile.createWriter(fs, conf, stringsPath, key, value, SequenceFile.CompressionType.RECORD);
}
From source file:org.oclc.firefly.hadoop.backup.Backup.java
License:Apache License
/**
 * Create mapper input files containing their paths to copy
 * @param mapperInput The list of files that the copy mappers should copy
 * @param numMapTasks The number of map tasks
 * @param fs The file system to write to
 * @param id The mapper id
 * @return The list of input files for a mapper
 * @throws IOException If we fail to create input files
 */
private List<Path> createMapperInputSequenceFiles(List<Pair<String, HRegionInfo>> mapperInput, int numMapTasks,
    FileSystem fs, int id) throws IOException {
  int idx = 0;
  List<Path> paths = new ArrayList<Path>();
  List<SequenceFile.Writer> writers = new ArrayList<SequenceFile.Writer>();
  String inputDir = getMapInputDirectory(id);

  // delete this directory if already exists
  fs.delete(new Path(inputDir), true);

  // each mapper gets an input file
  for (int i = 0; i < numMapTasks; i++) {
    // open the input file for writing
    Path mapInputFile = new Path(inputDir + "/mapper-input-" + i + ".txt");
    fs.delete(mapInputFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, fs.getConf(), mapInputFile, Text.class,
        HRegionInfo.class, SequenceFile.CompressionType.NONE);
    LOG.debug("Mapper input: " + mapInputFile);
    paths.add(mapInputFile);
    writers.add(writer);
  }

  // Assign copy paths to mappers, round-robin across the writers
  for (Pair<String, HRegionInfo> pair : mapperInput) {
    Text key = new Text(pair.getFirst());
    HRegionInfo value = new HRegionInfo(pair.getSecond());
    LOG.debug("Appending " + key + ", " + value.getEncodedName());
    writers.get(idx).append(key, value);
    idx++;
    if (idx >= writers.size()) {
      idx = 0;
    }
  }

  // close writers
  for (SequenceFile.Writer writer : writers) {
    try {
      writer.sync();
      writer.close();
    } catch (Exception e) {
      // nothing to do here
    }
  }

  return paths;
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromDenseToSeq(String inputPath, int cardinality, String outputFolderPath) {
  try {
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer;
    final IntWritable key = new IntWritable();
    final VectorWritable value = new VectorWritable();

    int lineNumber = 0;
    String thisLine;
    File[] filePathList = null;
    File inputFile = new File(inputPath);
    if (inputFile.isFile()) { // if it is a file
      filePathList = new File[1];
      filePathList[0] = inputFile;
    } else {
      filePathList = inputFile.listFiles();
    }
    if (filePathList == null) {
      log.error("The path " + inputPath + " does not exist");
      return;
    }
    for (File file : filePathList) {
      BufferedReader br = new BufferedReader(new FileReader(file));
      Vector vector = null;
      String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
      writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
          VectorWritable.class, CompressionType.BLOCK);
      while ((thisLine = br.readLine()) != null) { // while loop begins here
        if (thisLine.isEmpty())
          continue;
        String[] splitted = thisLine.split("\\s+");
        vector = new SequentialAccessSparseVector(splitted.length);
        for (int i = 0; i < splitted.length; i++) {
          vector.set(i, Double.parseDouble(splitted[i]));
        }
        key.set(lineNumber);
        value.set(vector);
        // System.out.println(vector);
        writer.append(key, value); // write this row
        lineNumber++;
      }
      writer.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
From source file:org.qcri.pca.FileFormat.java
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
  try {
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = null;
    final IntWritable key = new IntWritable();
    final VectorWritable value = new VectorWritable();

    Vector vector = null;
    String thisLine;
    int prevRowID = -1;
    boolean first = true;
    File[] filePathList = null;
    File inputFile = new File(inputPath);
    if (inputFile.isFile()) { // if it is a file
      filePathList = new File[1];
      filePathList[0] = inputFile;
    } else {
      filePathList = inputFile.listFiles();
    }
    if (filePathList == null) {
      log.error("The path " + inputPath + " does not exist");
      return;
    }
    for (File file : filePathList) {
      BufferedReader br = new BufferedReader(new FileReader(file));
      String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
      writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
          VectorWritable.class, CompressionType.BLOCK);
      while ((thisLine = br.readLine()) != null) { // while loop begins here
        String[] splitted = thisLine.split(",");
        int rowID = Integer.parseInt(splitted[0]);
        int colID = Integer.parseInt(splitted[1]);
        double element = Double.parseDouble(splitted[2]);
        if (first) {
          first = false;
          vector = new SequentialAccessSparseVector(cardinality);
        } else if (rowID != prevRowID) {
          key.set(prevRowID);
          value.set(vector);
          // System.out.println(vector);
          writer.append(key, value); // write the completed previous row
          vector = new SequentialAccessSparseVector(cardinality);
        }
        prevRowID = rowID;
        vector.set(colID - base, element);
      }
      /*
      // here we append the last vector in each file (assuming that we will
      // start a new row in the next file)
      key.set(prevRowID);
      value.set(vector);
      writer.append(key, value); // write last row
      writer.close();
      */
    }
    if (writer != null) { // append last vector in last file
      key.set(prevRowID);
      value.set(vector);
      writer.append(key, value); // write last row
      writer.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}
From source file:org.qcri.sparkpca.FileFormat.java
public static void convertFromCooToSeq(String inputPath, int cardinality, int base, String outputFolderPath) {
  try {
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.get(conf);
    SequenceFile.Writer writer = null;
    final IntWritable key = new IntWritable();
    final VectorWritable value = new VectorWritable();

    Vector vector = null;
    String thisLine;
    int lineNumber = 0;
    int prevRowID = -1;
    boolean first = true;
    File[] filePathList = null;
    File inputFile = new File(inputPath);
    if (inputFile.isFile()) { // if it is a file
      filePathList = new File[1];
      filePathList[0] = inputFile;
    } else {
      filePathList = inputFile.listFiles();
    }
    if (filePathList == null) {
      log.error("The path " + inputPath + " does not exist");
      return;
    }
    for (File file : filePathList) {
      BufferedReader br = new BufferedReader(new FileReader(file));
      String outputFileName = outputFolderPath + File.separator + file.getName() + ".seq";
      writer = SequenceFile.createWriter(fs, conf, new Path(outputFileName), IntWritable.class,
          VectorWritable.class, CompressionType.BLOCK);
      while ((thisLine = br.readLine()) != null) { // while loop begins here
        String[] splitted = thisLine.split(",");
        int rowID = Integer.parseInt(splitted[0]);
        int colID = Integer.parseInt(splitted[1]);
        double element = Double.parseDouble(splitted[2]);
        if (first) {
          first = false;
          vector = new SequentialAccessSparseVector(cardinality);
        } else if (rowID != prevRowID) {
          key.set(prevRowID);
          value.set(vector);
          // System.out.println(vector);
          writer.append(key, value); // write the completed previous row
          vector = new SequentialAccessSparseVector(cardinality);
        }
        prevRowID = rowID;
        vector.set(colID - base, element);
      }
    }
    if (writer != null) { // append last vector in last file
      key.set(prevRowID);
      value.set(vector);
      writer.append(key, value); // write last row
      writer.close();
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
}