Java tutorial
/* * Copyright 2012 The SCAPE Project Consortium. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * under the License. */ package eu.scape_project.tb.lsdr.seqfileutility; import eu.scape_project.tb.lsdr.seqfileutility.util.FileUtils; import java.io.*; import java.net.URI; import org.apache.commons.io.FilenameUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Utility methods for writing hadoop sequence files. * * @author Sven Schlarb https://github.com/shsdev * @version 0.1 */ public final class SequenceFileWriter implements Runnable, Serializable { private Path path; private SequenceFile.Writer writer; private Configuration conf; ProcessParameters pc; // Logger instance private static Logger logger = LoggerFactory.getLogger(SequenceFileWriter.class.getName()); //int lineCounter; private long filecount; private long linecount; String uri; File rootDir; File sequenceFile; private static int BUFFER_SIZE = 1024; public String getSequenceFileName() { return pc.getThreadSeqFile(); } /** * Private constructur initialising the hadoop configuration. */ private SequenceFileWriter() { conf = new Configuration(); filecount = 0L; linecount = 0L; } /** * Constructur that takes the sequence file name as an argument. * * @param sequenceFileName Sequence file name */ public SequenceFileWriter(ProcessParameters pc) { this(); this.pc = pc; initialise(); } /** * Append all files that have a defined extension from a root directory * recursively to the sequence file. * * @param rootDirName Root directory * @param extensionFilter File extension filter */ public void initialise() { rootDir = new File(pc.getThreadDir()); sequenceFile = new File(pc.getThreadSeqFile()); uri = sequenceFile.getAbsolutePath(); } /** * Traverse the root directory recursively * * @param dir Root directory * @throws FileNotFoundException * @throws IOException */ private void traverseDir(File dir) throws FileNotFoundException, IOException { if (writer != null) { if (dir.isDirectory()) { String[] children = dir.list(); for (String child : children) { traverseDir(new File(dir, child)); } } else if (!dir.isDirectory()) { if (pc.getExtStr() == null || pc.getExtStr().equals("") || dir.getName().endsWith(pc.getExtStr())) { if (pc.isTextlinemode()) { writeTextLines(dir); } else { writeFileContent(dir); } } } } else { logger.error(this.getId() + ": " + "Writer not available"); } } private void writeTextLines(File file) throws FileNotFoundException, IOException { BufferedReader buffer = new BufferedReader(new FileReader(file)); String line; Text key = new Text(); key.set(file.getAbsolutePath()); Text value = new Text(); while ((line = buffer.readLine()) != null) { value.set(line); writer.append(key, value); logger.info(this.getId() + ": " + value); linecount++; } buffer.close(); } private void writeFileContent(File file) throws IOException { long fileLength = file.length(); if (fileLength <= Integer.MAX_VALUE) { Text key = new Text(); String filePath = file.getAbsolutePath(); String keyPath = FilenameUtils.separatorsToUnix(filePath); key.set(keyPath); FileInputStream fis = new FileInputStream(file); byte[] byteArray = new byte[(int) fileLength]; byte[] buf = new byte[BUFFER_SIZE]; int bytesRead = fis.read(buf); int offset = 0; int chunk_count = 0; while (bytesRead != -1) { System.arraycopy(buf, 0, byteArray, offset, bytesRead); offset += bytesRead; bytesRead = fis.read(buf); chunk_count++; } BytesWritable value = new BytesWritable(byteArray); int len = (int) fileLength; value.setSize(len); filecount++; logger.info(this.getId() + ": " + filecount + ":" + key); writer.append(key, value); fis.close(); } else { logger.warn("File " + file.getAbsolutePath() + " is too large to be " + "added to a sequence file (skipped)."); } } public long getFilecount() { return filecount; } public long getLinecount() { return linecount; } public boolean isTextlinemode() { return pc.isTextlinemode(); } @Override public void run() { try { FileSystem fs = FileSystem.get(URI.create(uri), conf); path = new Path(uri); Class keyClass = Text.class; Class valueClass = BytesWritable.class; if (pc.isTextlinemode()) { keyClass = Text.class; valueClass = Text.class; } writer = SequenceFile.createWriter(fs, conf, path, keyClass, valueClass, CompressionType.get(pc.getCompressionType())); traverseDir(rootDir); } catch (Exception e) { logger.error(this.getId() + ": " + "IOException occurred", e); } finally { IOUtils.closeStream(writer); } } public String getId() { return pc.getThreadId(); } }