Java tutorial
/* * Copyright 2006-2013 the original author or authors. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package batch.demo.job; import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.util.LinkedHashMap; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.batch.core.partition.support.Partitioner; import org.springframework.batch.item.ExecutionContext; import org.springframework.core.io.Resource; import org.springframework.util.Assert; /** * Creates a set of partitions for a flat text file. * <p/> * Assumes that each record is stored on one and only one line. * Reads the file's byte stream detecting line ends and creates partitions * splitted at the new line border. Populates the {@link ExecutionContext} with * the byte offset for each partition thread and number of items/lines to be read from that position. * <p/> * Can be used to read the file concurrently. Each partition thread should use the byte offset specified by the * <tt>startAt</tt> * offset to set cursor at the starting position and a number of items (lines) to read as defined * by the <tt>itemsCount</tt> property. * * @author Sergey Shcherbakov * @author Stephane Nicoll */ public class FlatFilePartitioner implements Partitioner { /** * The {@link ExecutionContext} key name for the number of bytes the partition should skip on startup. */ public static final String DEFAULT_START_AT_KEY = "startAt"; /** * The {@link ExecutionContext} key name for number of items/lines to read in the partition. */ public static final String DEFAULT_ITEMS_COUNT_KEY = "itemsCount"; /** * The {@link ExecutionContext} key name for the file resource which has been used for partitioning. */ public static final String DEFAULT_RESOURCE_KEY = "resource"; /** * The common partition prefix name to use. */ public static final String PARTITION_PREFIX = "partition-"; private final Logger logger = LoggerFactory.getLogger(FlatFilePartitioner.class); private Resource resource; private String startAtKeyName = DEFAULT_START_AT_KEY; private String itemsCountKeyName = DEFAULT_ITEMS_COUNT_KEY; private String resourceKeyName = DEFAULT_RESOURCE_KEY; /** * The name of the key for the byte offset in each {@link ExecutionContext}. * Defaults to "startAt". * @param keyName the value of the key */ public void setStartAtKeyName(String keyName) { this.startAtKeyName = keyName; } /** * The name of the key for the byte offset in each {@link ExecutionContext}. * Defaults to "itemsCount". * @param keyName the value of the key */ public void setItemsCountKeyName(String keyName) { this.itemsCountKeyName = keyName; } /** * The name of the key for the file name in each {@link ExecutionContext}. * Defaults to "resource". * @param keyName the value of the key */ public void setResourceKeyName(String keyName) { this.resourceKeyName = keyName; } /** * Creates a set of {@link ExecutionContext} according to the provided * <tt>gridSize</tt> if there are enough elements. * <p/> * First computes the total number of items to process for the resource * and then split equality these in each partition. The returned context * hold the {@link #DEFAULT_START_AT_KEY} and {@link #DEFAULT_ITEMS_COUNT_KEY} properties * defining the number of elements to skip and the number of elements to * read respectively. * * @param gridSize the requested size of the grid * @return the execution contexts * @see #countItems(org.springframework.core.io.Resource) */ public Map<String, ExecutionContext> partition(int gridSize) { Assert.isTrue(gridSize > 0, "Grid size must be greater than 0"); checkResource(this.resource); if (logger.isDebugEnabled()) { logger.debug("Splitting [" + resource.getDescription() + "]"); } try { final Map<String, ExecutionContext> result = new LinkedHashMap<String, ExecutionContext>(); final long sizeInBytes = resource.contentLength(); if (sizeInBytes == 0) { logger.info("Empty input file [" + resource.getDescription() + "] no partition will be created."); return result; } PartitionBorderCursor partitionCursor = new PartitionBorderCursor(gridSize, sizeInBytes); // Check the case that the set is to small for the number of request partition(s) if (partitionCursor.getBytesPerPartition() == 0) { long lines = countItems(resource); logger.info("Not enough data (" + lines + ") for the requested gridSize [" + gridSize + "]"); partitionCursor.createPartition(0, lines, result); return result; } if (logger.isDebugEnabled()) { logger.debug("Has to split [" + sizeInBytes + "] byte(s) in [" + gridSize + "] " + "grid(s) (" + partitionCursor.getBytesPerPartition() + " each)"); } final int BUFFER_SIZE = 4096; final InputStream in = resource.getInputStream(); try { final InputStream is = new BufferedInputStream(in); byte[] c = new byte[BUFFER_SIZE]; ByteStreamCursor byteCursor = new ByteStreamCursor(); int readChars; while ((readChars = is.read(c)) != -1) { for (int i = 0; i < readChars; ++i) { if (byteCursor.lastSeenCharIsNewline(c[i])) { if (byteCursor.getCurrentByteInd() > partitionCursor.getPartitionBorder()) { partitionCursor.createPartition(byteCursor.getStartAt(), byteCursor.getLineCount(), result); byteCursor.startNewPartition(); } } } } if (byteCursor.lastLineUnterminated()) { byteCursor.startNewLine(); } if (byteCursor.outstandingData()) { partitionCursor.createPartition(byteCursor.getStartAt(), byteCursor.getLineCount(), result); } return result; } finally { in.close(); } } catch (IOException e) { throw new IllegalStateException( "Unexpected IO exception while partitioning [" + resource.getDescription() + "]", e); } } /** * This is a helper class to simplify the byte stream iterating code. * Tracks current location in the byte stream, number of lines counted from the * last partition start and from the input stream beginning. * Increments indexes on a new character read. * Detects the new line character and updates counters. */ private static class ByteStreamCursor { private long totalLineCount = 0; private long lineCount = 0; private byte lastSeenChar = 0; private long currentByteInd = 0L; private long startAt = 0; public boolean lastSeenCharIsNewline(byte lastSeenChar) { this.lastSeenChar = lastSeenChar; this.currentByteInd++; // New line is \n on Unix and \r\n on Windows if (lastSeenChar == '\n') { startNewLine(); return true; } return false; } public void startNewLine() { lineCount++; totalLineCount++; } public void startNewPartition() { startAt = currentByteInd; lineCount = 0; } public long getLineCount() { return lineCount; } public long getStartAt() { return startAt; } public long getCurrentByteInd() { return currentByteInd; } public boolean lastLineUnterminated() { return (totalLineCount > 0 && lastSeenChar != '\n') || // <-- last line is not empty but is not terminated by '\n' (totalLineCount == 0 && lastSeenChar != '\n' && currentByteInd > 0); // <-- the first line is the last line and it's not terminated by '\n' } public boolean outstandingData() { return currentByteInd > 0 && startAt != currentByteInd; } } /** * This is a helper class to simplify the byte stream iterating code. * Tracks the location of approximate byte offsets that split the input file into * approximately (+/-1) equal byte partitions. * When the main iteration passes this border the next partition will be created as soon * as the next new line character or end of stream is detected. */ private class PartitionBorderCursor { private int gridSize; private final long bytesPerPartition; private final long bytesRemainder; private long remainderCounter; private long partitionBorder; private int partitionIndex; PartitionBorderCursor(int gridSize, long sizeInBytes) { this.gridSize = gridSize; this.bytesPerPartition = sizeInBytes / gridSize; this.bytesRemainder = sizeInBytes % gridSize; this.remainderCounter = this.bytesRemainder; this.partitionBorder = 0; this.partitionIndex = 0; toNextPartitionBorder(); } public long getBytesPerPartition() { return bytesPerPartition; } public long getPartitionBorder() { return this.partitionBorder; } private void toNextPartitionBorder() { this.partitionBorder += bytesPerPartition + (remainderCounter-- > 0 ? 1 : 0); } public void createPartition(long startAt, long lineCount, final Map<String, ExecutionContext> result) { final String partitionName = getPartitionName(gridSize, partitionIndex++); result.put(partitionName, createExecutionContext(partitionName, startAt, lineCount)); toNextPartitionBorder(); } private String getPartitionName(int gridSize, int partitionIndex) { final String partitionNumberFormat = "%0" + String.valueOf(gridSize).length() + "d"; return PARTITION_PREFIX + String.format(partitionNumberFormat, partitionIndex); } } /** * Creates a standard {@link ExecutionContext} with the specified parameters. * @param partitionName the name of the partition * @param startAt the number of bytes for a partition thread to skip before starting reading * @param itemsCount the number of items to read * @return the execution context (output) */ protected ExecutionContext createExecutionContext(String partitionName, long startAt, long itemsCount) { final ExecutionContext executionContext = new ExecutionContext(); executionContext.putLong(startAtKeyName, startAt); executionContext.putLong(itemsCountKeyName, itemsCount); try { executionContext.putString(resourceKeyName, "file:" + resource.getFile().getPath()); } catch (IOException e) { throw new IllegalArgumentException("File could not be located for: " + resource, e); } if (logger.isDebugEnabled()) { logger.debug("Added partition [" + partitionName + "] with [" + executionContext + "]"); } return executionContext; } /** * Returns the number of elements in the specified {@link Resource}. * * @param resource the resource * @return the number of items contained in the resource */ protected long countItems(Resource resource) { try { final InputStream in = resource.getInputStream(); try { return countLines(in); } finally { in.close(); } } catch (IOException e) { throw new IllegalStateException( "Unexpected IO exception while counting items for [" + resource.getDescription() + "]", e); } } /** * Returns the number of lines found in the specified stream. * <p/> * The caller is responsible to close the stream. * * Up to 5 times faster than using BufferedReader and up to 2 times faster * than LineNumberReader. * * @param in the input stream to use * @return the number of lines found in the stream * @throws IOException if an error occurred */ public static long countLines(InputStream in) throws IOException { final InputStream is = new BufferedInputStream(in); byte[] c = new byte[4096]; long count = 0; int readChars; byte lastChar = 0; boolean contentExists = false; while ((readChars = is.read(c)) != -1) { for (int i = 0; i < readChars; ++i) { contentExists = true; lastChar = c[i]; // We're dealing with the char here, it's \n on Unix and \r\n on Windows if (c[i] == '\n') ++count; } } // Last line if ((count > 0 && lastChar != '\n') || // <-- last line is not empty but is not terminated by '\n' (count == 0 && lastChar != '\n' && contentExists)) { // <-- the first line is the last line and it's not terminated by '\n' count++; } return count; } /** * Checks whether the specified {@link Resource} is valid. * * @param resource the resource to check * @throws IllegalStateException if the resource is invalid */ protected void checkResource(Resource resource) { Assert.notNull(resource, "Resource is not set"); if (!resource.exists()) { throw new IllegalStateException("Input resource must exist: " + resource); } if (!resource.isReadable()) { throw new IllegalStateException("Input resource must be readable: " + resource); } } /** * Sets the input {@link Resource} to use. * * @param resource the resource to partition */ public void setResource(Resource resource) { this.resource = resource; } }