it.crs4.seal.common.CutText.java Source code

Java tutorial

Introduction

Here is the source code for it.crs4.seal.common.CutText.java

Source

// Copyright (C) 2011-2012 CRS4.
//
// This file is part of Seal.
//
// Seal is free software: you can redistribute it and/or modify it
// under the terms of the GNU General Public License as published by the Free
// Software Foundation, either version 3 of the License, or (at your option)
// any later version.
//
// Seal is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
// or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// for more details.
//
// You should have received a copy of the GNU General Public License along
// with Seal.  If not, see <http://www.gnu.org/licenses/>.

package it.crs4.seal.common;

import java.util.ArrayList;
import org.apache.hadoop.io.Text;

/**
 * Extracts a fixed set of delimiter-separated columns from {@link Text} records.
 *
 * <p>An instance is configured once with a delimiter and the zero-based indices
 * of the columns of interest. Each call to {@link #loadRecord(Text)} scans one
 * record and stores the requested fields, which can then be read back with
 * {@link #getField(int)} and {@link #getFieldPos(int)}.
 *
 * <p>Instances keep the result of the last scan in mutable arrays; nothing in
 * this class synchronizes access to them.
 */
public class CutText {
    /**
     * Thrown when a record cannot be decoded or does not contain all of the
     * requested fields. The offending record is appended to the message.
     */
    public static class FormatException extends Exception {
        private static final long serialVersionUID = 1L;

        /**
         * @param msg    description of the problem.
         * @param record the record that triggered the problem.
         */
        public FormatException(String msg, Text record) {
            super(msg + " Record: " + record);
        }

        /**
         * @param msg    description of the problem.
         * @param record the record that triggered the problem.
         * @param cause  the underlying exception, preserved for diagnostics.
         */
        public FormatException(String msg, Text record, Throwable cause) {
            super(msg + " Record: " + record, cause);
        }
    }

    private final String delim;

    // Number of bytes the delimiter occupies once encoded by Text; used to skip
    // over the whole delimiter when moving from one field to the next.
    private final int delimByteLength;

    private final ArrayList<Integer> columns;
    private final String[] extractedFields;
    private final int[] extractedFieldPositions;

    /**
     * @param delimiter delimiter string that separates fields within the records to be scanned.
     *                  Must not be empty.
     * @param cols      the zero-based indices of the columns to be extracted from the records.
     *                  Must be non-empty, non-negative, strictly increasing (sorted, no duplicates).
     * @throws IllegalArgumentException if the delimiter is empty, no columns are given,
     *                                  the columns are not strictly increasing, or a column
     *                                  index is negative.
     */
    public CutText(String delimiter, int... cols) {
        if (delimiter.length() == 0)
            throw new IllegalArgumentException("empty string is an invalid delimiter");

        delim = delimiter;
        // Measure the delimiter the same way Text stores strings, so the length
        // is expressed in the same byte units as the positions used in loadRecord.
        delimByteLength = new Text(delimiter).getLength();

        columns = new ArrayList<Integer>(cols.length);
        for (int c : cols)
            columns.add(c);

        if (columns.size() == 0)
            throw new IllegalArgumentException("no columns specified");

        // ensure the columns are in sorted order
        for (int i = 1; i < columns.size(); ++i) {
            if (columns.get(i - 1) >= columns.get(i))
                throw new IllegalArgumentException(
                        "specified columns must be in sorted order and must not contain duplicates");
        }

        // The list is strictly increasing, so checking the first element suffices.
        if (columns.get(0) < 0)
            throw new IllegalArgumentException("column indices must not be negative");

        extractedFields = new String[columns.size()];
        extractedFieldPositions = new int[columns.size()];
        // initialize so that trying to access the fields before
        // calling loadRecord results in obviously bad stuff
        clearExtractedFields();
    }

    /**
     * Scans {@code record} and stores the requested columns.
     *
     * <p>Any values left from a previous call are cleared first, so after a
     * failed call the accessors return {@code null} / {@code -1} for fields that
     * were not extracted instead of data from an earlier record.
     *
     * @param record the record to scan.
     * @throws FormatException if a requested field cannot be decoded, or the
     *                         record ends before all requested fields were found.
     */
    public void loadRecord(Text record) throws FormatException {
        clearExtractedFields();

        final int recordLength = record.getLength();
        final int numColumns = columns.size();

        int pos = 0; // the byte position within the record
        int fieldno = 0; // the field index within the record
        int colno = 0; // the index within the list of requested fields (columns)
        try {
            while (pos < recordLength && colno < numColumns) // iterate over each field
            {
                int endpos = record.find(delim, pos); // the field's end position
                if (endpos < 0)
                    endpos = recordLength; // last field: runs to the end of the record

                if (columns.get(colno) == fieldno) // if we're at a requested field
                {
                    extractedFields[colno] = Text.decode(record.getBytes(), pos, endpos - pos);
                    extractedFieldPositions[colno] = pos;
                    colno += 1; // advance column
                }

                // Skip the entire delimiter, not just one byte, so that
                // multi-byte delimiters don't leak into the next field.
                pos = endpos + delimByteLength;
                fieldno += 1;
            }
        } catch (java.nio.charset.CharacterCodingException e) {
            throw new FormatException("character coding exception.  Message: " + e.getMessage(), record, e);
        }

        if (colno < numColumns)
            throw new FormatException(
                    "Missing field(s) in record. Field " + columns.get(colno) + " (zero-based) not found.",
                    record);
    }

    /**
     * Returns the text of the {@code i}-th requested column from the last loaded record.
     *
     * @param i index into the list of requested columns (not the column number in the record).
     * @return the field's text, or {@code null} if it has not been extracted.
     */
    public String getField(int i) {
        return extractedFields[i];
    }

    /**
     * @return the number of columns this instance was configured to extract.
     */
    public int getNumFields() {
        return extractedFields.length;
    }

    /**
     * Returns the starting byte position, within the last loaded record, of the
     * {@code i}-th requested column.
     *
     * @param i index into the list of requested columns (not the column number in the record).
     * @return the field's starting position, or {@code -1} if it has not been extracted.
     */
    public int getFieldPos(int i) {
        return extractedFieldPositions[i];
    }

    // Resets the extraction results to their "nothing loaded" sentinel values.
    private void clearExtractedFields() {
        for (int i = 0; i < extractedFields.length; ++i) {
            extractedFields[i] = null;
            extractedFieldPositions[i] = -1;
        }
    }
}