be.ibridge.kettle.jsoup.JsoupInput.java Source code

Java tutorial

Introduction

Here is the source code for be.ibridge.kettle.jsoup.JsoupInput.java

Source

/* Copyright (c) 2007 Pentaho Corporation.  All rights reserved. 
* This software was developed by Pentaho Corporation and is provided under the terms 
* of the GNU Lesser General Public License, Version 2.1. You may not use 
* this file except in compliance with the license. If you need a copy of the license, 
* please go to http://www.gnu.org/licenses/lgpl-2.1.txt. The Original Code is Pentaho 
* Data Integration.  The Initial Developer is Samatar HASSAN.
*
* Software distributed under the GNU Lesser Public License is distributed on an "AS IS" 
* basis, WITHOUT WARRANTY OF ANY KIND, either express or  implied. Please refer to 
* the license for the specific language governing your rights and limitations.*/

package be.ibridge.kettle.jsoup;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.vfs.FileObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.ResultFile;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.fileinput.FileInputList;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Reads HTML sources (files, URLs or strings) with jsoup, converts the selected elements to rows
 * and writes those rows to one or more output streams.
 * 
 * @author Samatar
 * @since 20-06-2010
 */
public class JsoupInput extends BaseStep implements StepInterface {
    private static Class<?> PKG = JsoupInputMeta.class; // for i18n purposes, needed by Translator2!!   $NON-NLS-1$

    private JsoupInputMeta meta;
    private JsoupInputData data;

    /**
     * Creates the step instance; all arguments are handed unchanged to the {@link BaseStep} constructor.
     *
     * @param stepMeta          the step definition, passed to the superclass
     * @param stepDataInterface the step data object, passed to the superclass
     * @param copyNr            the copy number of this step, passed to the superclass
     * @param transMeta         the transformation definition, passed to the superclass
     * @param trans             the running transformation, passed to the superclass
     */
    public JsoupInput(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
            Trans trans) {
        super(stepMeta, stepDataInterface, copyNr, transMeta, trans);
    }

    /**
     * Allocates a blank row whose size is derived from the output row meta-data.
     *
     * @return the row array returned by {@link RowDataUtil#allocateRowData(int)}
     */
    private Object[] buildEmptyRow() {
        final int fieldCount = data.outputRowMeta.size();
        return RowDataUtil.allocateRowData(fieldCount);
    }

    /**
     * Fails the step when any file in {@code data.files} is reported as missing or as not accessible.
     * The offending files are logged as an error before the exception is raised.
     *
     * @throws KettleException if at least one file does not exist or cannot be accessed
     */
    private void handleMissingFiles() throws KettleException {
        // Files that do not exist at all
        final List<FileObject> missing = data.files.getNonExistantFiles();
        if (!missing.isEmpty()) {
            final String description = FileInputList.getRequiredFilesDescription(missing);
            final String title = BaseMessages.getString(PKG, "JsoupInput.Log.RequiredFilesTitle");
            final String detail = BaseMessages.getString(PKG, "JsoupInput.Log.RequiredFiles", description);
            log.logError(title, detail);

            throw new KettleException(
                    BaseMessages.getString(PKG, "JsoupInput.Log.RequiredFilesMissing", description));
        }

        // Files that exist but cannot be read
        final List<FileObject> unreadable = data.files.getNonAccessibleFiles();
        if (!unreadable.isEmpty()) {
            final String description = FileInputList.getRequiredFilesDescription(unreadable);
            final String title = BaseMessages.getString(PKG, "JsoupInput.Log.RequiredFilesTitle");
            final String detail = BaseMessages.getString(PKG, "JsoupInput.Log.RequiredNotAccessibleFiles",
                    description);
            log.logError(title, detail);

            throw new KettleException(
                    BaseMessages.getString(PKG, "JsoupInput.Log.RequiredNotAccessibleFilesMissing", description));
        }
    }

    /**
     * Reads the next incoming row and parses the source designated by its source field.
     * <p>
     * On the first row the input/output/conversion row meta-data is prepared and the index of the
     * source field is resolved. The field value is then treated either as a file name
     * ({@code meta.getIsAFile()}) or as the text to parse. Empty files are skipped when
     * {@code meta.isIgnoreEmptyFile()} is set: the method simply moves on to the next incoming row,
     * so every source is parsed exactly once.
     *
     * @return {@code true} when a source was read and parsed; {@code false} when there are no more
     *         incoming rows or when an error occurred (in which case the error is logged, the
     *         transformation is stopped and the error count is set)
     */
    private boolean ReadNextString() {

        try {
            // Loop instead of recursing so that a run of empty files neither grows the stack nor
            // causes the following source to be parsed a second time.
            while (true) {
                data.readrow = getRow(); // Grab another row ...

                if (data.readrow == null) {
                    // finished processing!
                    if (log.isDetailed())
                        logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.FinishedProcessing"));
                    return false;
                }

                if (first) {
                    first = false;

                    data.inputRowMeta = getInputRowMeta();
                    data.outputRowMeta = data.inputRowMeta.clone();
                    meta.getFields(data.outputRowMeta, getStepname(), null, null, this);

                    // Get total previous fields
                    data.totalpreviousfields = data.inputRowMeta.size();

                    // For String to <type> conversions, we allocate a conversion meta data row in which
                    // every field is typed as String (it carries the Date & Number formatters).
                    data.convertRowMeta = data.outputRowMeta.clone();
                    for (int i = 0; i < data.convertRowMeta.size(); i++) {
                        data.convertRowMeta.getValueMeta(i).setType(ValueMetaInterface.TYPE_STRING);
                    }

                    // Check if source field is provided
                    if (Const.isEmpty(meta.getFieldValue())) {
                        logError(BaseMessages.getString(PKG, "JsoupInput.Log.NoField"));
                        throw new KettleException(BaseMessages.getString(PKG, "JsoupInput.Log.NoField"));
                    }

                    // cache the position of the field
                    if (data.indexSourceField < 0) {
                        data.indexSourceField = getInputRowMeta().indexOfValue(meta.getFieldValue());
                        if (data.indexSourceField < 0) {
                            // The field is unreachable !
                            logError(BaseMessages.getString(PKG, "JsoupInput.Log.ErrorFindingField", //$NON-NLS-1$
                                    meta.getFieldValue())); //$NON-NLS-2$
                            throw new KettleException(BaseMessages.getString(PKG,
                                    "JsoupInput.Exception.CouldnotFindField", meta.getFieldValue())); //$NON-NLS-1$ //$NON-NLS-2$
                        }
                    }
                }

                // get source field value
                String fieldValue = getInputRowMeta().getString(data.readrow, data.indexSourceField);

                if (log.isDetailed())
                    logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.SourceValue", meta.getFieldValue(),
                            fieldValue));

                if (meta.getIsAFile()) {
                    // Close previous file if needed (same housekeeping as openNextFile)
                    if (data.file != null)
                        data.file.close();

                    // source is a file.
                    data.file = KettleVFS.getFileObject(fieldValue, getTransMeta());
                    if (meta.isIgnoreEmptyFile() && data.file.getContent().getSize() == 0) {
                        // log only basic as a warning (was before logError)
                        logBasic(BaseMessages.getString(PKG, "JsoupInput.Error.FileSizeZero", data.file.getName()));
                        // Skip this file: move on to the next incoming row.
                        continue;
                    }
                } else {
                    // read string
                    data.stringToParse = fieldValue;
                }

                readFileOrString();
                return true;
            }
        } catch (Exception e) {
            logError(BaseMessages.getString(PKG, "JsoupInput.Log.UnexpectedError", e.toString()));
            stopAll();
            logError(Const.getStackTracker(e));
            setErrors(1);
            return false;
        }
    }

    /**
     * Registers the given file among the result files, but only when the step is configured to do so
     * ({@code meta.addResultFile()}).
     *
     * @param file the file to register
     * @throws Exception if creating or registering the result file fails
     */
    private void addFileToResultFilesname(FileObject file) throws Exception {
        if (!meta.addResultFile()) {
            return;
        }

        // Add this to the result file names...
        final String transName = getTransMeta().getName();
        final ResultFile entry = new ResultFile(ResultFile.FILE_TYPE_GENERAL, file, transName, getStepname());
        entry.setComment(BaseMessages.getString(PKG, "JsoupInput.Log.FileAddedResult"));
        addResultFile(entry);
    }

    /**
     * Opens and parses the next file of {@code data.files}, starting at index {@code data.filenr}.
     * <p>
     * The previously opened file, if any, is closed first. When {@code meta.isIgnoreEmptyFile()} is
     * set, zero-length files are logged and skipped; the file index is advanced for each skipped file
     * so the same file is never reopened.
     *
     * @return {@code true} when a file was read and parsed; {@code false} when all files have been
     *         processed or when an error occurred (in which case the error is logged, the
     *         transformation is stopped and the error count is set)
     */
    private boolean openNextFile() {
        try {
            while (true) {
                if (data.filenr >= data.files.nrOfFiles()) {
                    // finished processing!
                    if (log.isDetailed())
                        logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.FinishedProcessing"));
                    return false;
                }
                // Close previous file if needed
                if (data.file != null)
                    data.file.close();
                // get file
                data.file = (FileObject) data.files.getFile(data.filenr);
                if (meta.isIgnoreEmptyFile() && data.file.getContent().getSize() == 0) {
                    // log only basic as a warning (was before logError)
                    logBasic(BaseMessages.getString(PKG, "JsoupInput.Error.FileSizeZero", "" + data.file.getName()));
                    // Move the file pointer past the empty file; readFileOrString() (which normally
                    // advances it) is not called for skipped files.
                    data.filenr++;
                    continue;
                }
                readFileOrString();
                return true;
            }
        } catch (Exception e) {
            // String.valueOf guards against data.file still being null when the failure happened early.
            logError(BaseMessages.getString(PKG, "JsoupInput.Log.UnableToOpenFile", "" + data.filenr,
                    String.valueOf(data.file), e.toString()));
            stopAll();
            setErrors(1);
            return false;
        }
    }

    /**
     * Prepares the current source and parses it.
     * <p>
     * When a file is set in {@code data.file}, its name is captured, the optional file attributes
     * (short name, path, hidden flag, extension, last modification date, URI, root URI, size) are
     * captured for each attribute whose output field name is configured, the file counter is
     * advanced and the file is offered to the result files. In every case {@link #parseJsoup()} is
     * called afterwards.
     *
     * @throws Exception if a file attribute cannot be read or parsing fails
     */
    private void readFileOrString() throws Exception {
        final FileObject source = data.file;

        if (source != null) {
            data.filename = KettleVFS.getFilename(source);

            // Add additional fields? Each one is only captured when its field name is configured.
            if (!Const.isEmpty(meta.getShortFileNameField())) {
                data.shortFilename = source.getName().getBaseName();
            }
            if (!Const.isEmpty(meta.getPathField())) {
                data.path = KettleVFS.getFilename(source.getParent());
            }
            if (!Const.isEmpty(meta.isHiddenField())) {
                data.hidden = source.isHidden();
            }
            if (!Const.isEmpty(meta.getExtensionField())) {
                data.extension = source.getName().getExtension();
            }
            if (!Const.isEmpty(meta.getLastModificationDateField())) {
                data.lastModificationDateTime = new Date(source.getContent().getLastModifiedTime());
            }
            if (!Const.isEmpty(meta.getUriField())) {
                data.uriName = source.getName().getURI();
            }
            if (!Const.isEmpty(meta.getRootUriField())) {
                data.rootUriName = source.getName().getRootURI();
            }

            // The size is always queried, but only stored when a size field is requested.
            final long length = source.getContent().getSize();
            if (!Const.isEmpty(meta.getSizeField())) {
                data.size = length;
            }

            // Move file pointer ahead!
            data.filenr++;

            if (log.isDetailed()) {
                logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.OpeningFile", source.toString()));
            }

            addFileToResultFilesname(source);
        }

        parseJsoup();
    }

    /**
     * Parses the current source with jsoup and evaluates the selector of every input field.
     * <p>
     * The source is the current file when {@code data.file} is set, otherwise
     * {@code data.stringToParse}, interpreted as a URL when {@code meta.isReadUrl()} is set and as
     * markup otherwise. The selections are stored in {@code data.resultList} (one entry per input
     * field) and {@code data.nrrecords} receives the number of records of the document;
     * {@code data.recordnr} is reset to 0.
     * <p>
     * All selectors that match something must match the same number of elements. Selectors that
     * match nothing are padded with {@code null} entries up to that number, regardless of the order
     * in which the fields are declared. When no selector matches anything the record count is 0.
     *
     * @throws KettleException if two selectors match a different, non-zero number of elements
     * @throws Exception       if the source cannot be read or parsed
     */
    private void parseJsoup() throws Exception {

        // Read JSOUP source
        if (data.file != null) {
            // NOTE(review): the file is read through java.io.File with a fixed UTF-8 charset;
            // confirm whether non-local VFS locations need to be supported here.
            data.jsoupReader = Jsoup.parse(new File(data.filename), "UTF-8");
        } else if (meta.isReadUrl()) {
            // Second argument is the fetch timeout in milliseconds.
            data.jsoupReader = Jsoup.parse(new URL(data.stringToParse), 1000);
        } else {
            // read string
            data.jsoupReader = Jsoup.parse(data.stringToParse);
        }

        data.nrrecords = -1;
        data.recordnr = 0;

        List<Elements> selections = new ArrayList<Elements>();
        String countPath = ""; // selector that established the current record count
        for (int i = 0; i < data.nrInputFields; i++) {
            String path = meta.getInputFields()[i].getPath();
            Elements matches = data.jsoupReader.select(path);
            int matchCount = matches.size();

            // Only non-empty selections take part in the structure check.
            if (matchCount > 0 && data.nrrecords > 0 && data.nrrecords != matchCount) {
                throw new KettleException(BaseMessages.getString(PKG, "JsoupInput.Error.BadStructure", matchCount,
                        path, countPath, data.nrrecords));
            }

            // The record count comes from the first selector that matches something. An empty
            // selection must not pin it to 0, otherwise a later non-empty selector would be
            // rejected only because of the field order.
            if (data.nrrecords <= 0) {
                data.nrrecords = matchCount;
                countPath = path;
            }
            selections.add(matches);
        }

        data.resultList = new ArrayList<Elements>();
        for (Elements matches : selections) {
            if (matches.size() == 0) {
                // The selector matched nothing: create a dummy structure with one null per record
                // so that buildRow() can index every field by record number.
                matches = new Elements();
                for (int i = 0; i < data.nrrecords; i++) {
                    matches.add(null);
                }
            }
            data.resultList.add(matches);
        }

        if (log.isDetailed()) {
            logDetailed(BaseMessages.getString(PKG, "JsoupInput.Log.NrRecords", data.nrrecords));
        }

    }

    /**
     * Produces one output row per call.
     * <p>
     * On the first call in file mode (i.e. when {@code meta.isInFields()} is false) the file list is
     * resolved and checked and the output and conversion row meta-data are created. Each call then
     * fetches one row through {@link #getOneRow()}, passes it on and enforces the optional row limit.
     *
     * @param smi the step meta-data (not used directly; {@code meta} is set in {@code init})
     * @param sdi the step data (not used directly; {@code data} is set in {@code init})
     * @return {@code true} when more rows may follow; {@code false} when the input is exhausted, the
     *         row limit has been reached or a fatal error occurred
     * @throws KettleException if no files are found (unless configured not to fail) or if required
     *                         files are missing or not accessible
     */
    public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException {
        if (first && !meta.isInFields()) {
            first = false;

            data.files = meta.getFiles(this);

            if (!meta.isdoNotFailIfNoFile() && data.files.nrOfFiles() == 0) {
                throw new KettleException(BaseMessages.getString(PKG, "JsoupInput.Log.NoFiles"));
            }

            handleMissingFiles();

            // Create the output row meta-data
            data.outputRowMeta = new RowMeta();

            meta.getFields(data.outputRowMeta, getStepname(), null, null, this);

            // For String to <type> conversions, we allocate a conversion meta data row in which
            // every field is typed as String (it carries the Date & Number formatters).
            data.convertRowMeta = data.outputRowMeta.clone();
            for (int i = 0; i < data.convertRowMeta.size(); i++) {
                data.convertRowMeta.getValueMeta(i).setType(ValueMetaInterface.TYPE_STRING);
            }
        }
        Object[] r = null;
        try {
            // Grab a row
            r = getOneRow();
            if (r == null) {
                setOutputDone(); // signal end to receiver(s)
                return false; // end of data or error.
            }

            if (log.isRowLevel())
                logRowlevel(BaseMessages.getString(PKG, "JsoupInput.Log.ReadRow", data.outputRowMeta.getString(r)));
            incrementLinesInput();
            data.rownr++;

            putRow(data.outputRowMeta, r); // copy row to output rowset(s);

            if (meta.getRowLimit() > 0 && data.rownr > meta.getRowLimit()) {
                // limit has been reached: stop now.
                setOutputDone();
                return false;
            }

        } catch (Exception e) {
            if (!getStepMeta().isDoingErrorHandling()) {
                logError(BaseMessages.getString(PKG, "JsoupInput.ErrorInStepRunning", e.getMessage())); //$NON-NLS-1$
                // Keep the stack trace: the message alone is often null or too terse to diagnose.
                logError(Const.getStackTracker(e));
                setErrors(1);
                stopAll();
                setOutputDone(); // signal end to receiver(s)
                return false;
            }

            // Error handling is enabled: simply add this row to the error row
            putError(getInputRowMeta(), r, 1, e.toString(), null, "JsoupInput001");
        }
        return true;
    }

    /**
     * Returns the next output row, loading the next source first whenever the current one has no
     * records left.
     * <p>
     * In in-fields mode sources come from incoming rows ({@link #ReadNextString()}); otherwise they
     * come from the file list ({@link #openNextFile()}).
     *
     * @return the next row, or {@code null} when no further source could be loaded
     * @throws KettleException if the row cannot be built
     */
    private Object[] getOneRow() throws KettleException {
        if (meta.isInFields()) {
            // Sources are provided by the incoming rows.
            while (data.readrow == null || data.recordnr >= data.nrrecords) {
                final boolean loaded = ReadNextString();
                if (!loaded || data.readrow == null) {
                    return null;
                }
            }
        } else {
            // Sources are the files of the file list.
            while (data.file == null || data.recordnr >= data.nrrecords) {
                final boolean opened = openNextFile();
                if (!opened) {
                    return null;
                }
            }
        }

        return buildRow();
    }

    /**
     * Builds the output row for record {@code data.recordnr} of the current document and advances
     * the record counter.
     * <p>
     * The row starts as a copy of the incoming row when there is one, otherwise as an empty row.
     * For every input field the selected element is turned into a string (text, inner/outer HTML or
     * an attribute value), trimmed as configured, converted to the target type and stored at
     * {@code data.totalpreviousfields + i}. The optional extra fields (file name, row number, short
     * file name, extension, path, size, hidden flag, last modification date, URI, root URI) are
     * appended after the input fields.
     *
     * @return the completed output row
     * @throws KettleException if a value cannot be converted to its target type
     */
    private Object[] buildRow() throws KettleException {
        // Start from a copy of the incoming row when there is one, otherwise from a new empty row.
        Object[] outputRowData = data.readrow != null ? data.readrow.clone() : buildEmptyRow();

        // Read fields...
        for (int i = 0; i < data.nrInputFields; i++) {
            // Get field
            JsoupInputField field = meta.getInputFields()[i];
            int targetIndex = data.totalpreviousfields + i;

            // get jsoup array for field
            Elements jsoupa = data.resultList.get(i);
            String nodevalue = null;
            if (jsoupa != null) {
                // May be null: selections that matched nothing are padded with nulls by parseJsoup()
                Element jo = jsoupa.get(data.recordnr);
                if (jo != null) {

                    // Do Element Type
                    switch (field.getElementType()) {
                    case JsoupInputField.ELEMENT_TYPE_NODE:
                        // Do Result Type
                        switch (field.getResultType()) {
                        case JsoupInputField.RESULT_TYPE_TEXT:
                            nodevalue = jo.text();
                            break;
                        case JsoupInputField.RESULT_TYPE_TYPE_OUTER_HTML:
                            nodevalue = jo.outerHtml();
                            break;
                        case JsoupInputField.RESULT_TYPE_TYPE_INNER_HTML:
                            nodevalue = jo.html();
                            break;
                        default:
                            nodevalue = jo.toString();
                            break;
                        }
                        break;
                    case JsoupInputField.ELEMENT_TYPE_ATTRIBUT:
                        nodevalue = jo.attr(field.getAttribute());
                        break;
                    default:
                        nodevalue = jo.toString();
                        break;
                    }
                }
            }

            // Do trimming
            switch (field.getTrimType()) {
            case JsoupInputField.TYPE_TRIM_LEFT:
                nodevalue = Const.ltrim(nodevalue);
                break;
            case JsoupInputField.TYPE_TRIM_RIGHT:
                nodevalue = Const.rtrim(nodevalue);
                break;
            case JsoupInputField.TYPE_TRIM_BOTH:
                nodevalue = Const.trim(nodevalue);
                break;
            default:
                break;
            }

            if (meta.isInFields()) {
                // Add result field to input stream (this may return a larger array)
                outputRowData = RowDataUtil.addValueData(outputRowData, targetIndex, nodevalue);
            }
            // Do conversions
            //
            ValueMetaInterface targetValueMeta = data.outputRowMeta.getValueMeta(targetIndex);
            ValueMetaInterface sourceValueMeta = data.convertRowMeta.getValueMeta(targetIndex);
            outputRowData[targetIndex] = targetValueMeta.convertData(sourceValueMeta, nodevalue);

            // Do we need to repeat this field if it is null?
            if (field.isRepeated()) {
                if (data.previousRow != null && Const.isEmpty(nodevalue)) {
                    outputRowData[targetIndex] = data.previousRow[targetIndex];
                }
            }
        } // End of loop over fields...

        // The extra fields follow the input fields, which themselves follow the fields of the
        // incoming row. Starting at data.nrInputFields alone would overwrite incoming/extracted
        // values in in-fields mode.
        int rowIndex = data.totalpreviousfields + data.nrInputFields;

        // See if we need to add the filename to the row...
        if (meta.includeFilename() && !Const.isEmpty(meta.getFilenameField())) {
            outputRowData[rowIndex++] = data.filename;
        }
        // See if we need to add the row number to the row...
        if (meta.includeRowNumber() && !Const.isEmpty(meta.getRowNumberField())) {
            outputRowData[rowIndex++] = Long.valueOf(data.rownr);
        }
        // Possibly add short filename...
        if (meta.getShortFileNameField() != null && meta.getShortFileNameField().length() > 0) {
            outputRowData[rowIndex++] = data.shortFilename;
        }
        // Add Extension
        if (meta.getExtensionField() != null && meta.getExtensionField().length() > 0) {
            outputRowData[rowIndex++] = data.extension;
        }
        // add path
        if (meta.getPathField() != null && meta.getPathField().length() > 0) {
            outputRowData[rowIndex++] = data.path;
        }
        // Add Size
        if (meta.getSizeField() != null && meta.getSizeField().length() > 0) {
            outputRowData[rowIndex++] = Long.valueOf(data.size);
        }
        // add Hidden: use the hidden flag captured from the file, not the path string
        if (meta.isHiddenField() != null && meta.isHiddenField().length() > 0) {
            outputRowData[rowIndex++] = data.hidden;
        }
        // Add modification date
        if (meta.getLastModificationDateField() != null && meta.getLastModificationDateField().length() > 0) {
            outputRowData[rowIndex++] = data.lastModificationDateTime;
        }
        // Add Uri
        if (meta.getUriField() != null && meta.getUriField().length() > 0) {
            outputRowData[rowIndex++] = data.uriName;
        }
        // Add RootUri
        if (meta.getRootUriField() != null && meta.getRootUriField().length() > 0) {
            outputRowData[rowIndex++] = data.rootUriName;
        }
        data.recordnr++;

        RowMetaInterface irow = getInputRowMeta();

        // Remember this row for the "repeat" option.
        // NOTE(review): the copy is made with the *input* row meta-data, and no copy is made at all
        // when there is no input; confirm that this is sufficient for the repeated output fields.
        data.previousRow = irow == null ? outputRowData : (Object[]) irow.cloneRow(outputRowData);

        return outputRowData;
    }

    /**
     * Initialises the step: stores the meta and data objects, resets the row counter, records the
     * number of input fields, applies variable substitution to every field path and creates an
     * empty jsoup document.
     *
     * @param smi the step meta-data, expected to be a {@link JsoupInputMeta}
     * @param sdi the step data, expected to be a {@link JsoupInputData}
     * @return {@code true} when the superclass initialisation succeeded, {@code false} otherwise
     */
    public boolean init(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (JsoupInputMeta) smi;
        data = (JsoupInputData) sdi;

        if (!super.init(smi, sdi)) {
            return false;
        }

        data.rownr = 1L;

        final JsoupInputField[] inputFields = meta.getInputFields();
        data.nrInputFields = inputFields.length;

        // Take care of variable substitution
        for (JsoupInputField inputField : inputFields) {
            final String resolvedPath = environmentSubstitute(inputField.getPath());
            inputField.setPath(resolvedPath);
        }

        // Init a new JSOUP reader
        data.jsoupReader = new Document("");

        return true;
    }

    /**
     * Releases the resources held by the step: closes the current file, if any, drops the parsed
     * selections and delegates to the superclass.
     * <p>
     * Closing the file is best effort: a failure is logged but never prevents the rest of the
     * clean-up from running.
     *
     * @param smi the step meta-data, expected to be a {@link JsoupInputMeta}
     * @param sdi the step data, expected to be a {@link JsoupInputData}
     */
    public void dispose(StepMetaInterface smi, StepDataInterface sdi) {
        meta = (JsoupInputMeta) smi;
        data = (JsoupInputData) sdi;
        if (data.file != null) {
            try {
                data.file.close();
            } catch (Exception e) {
                // Do not fail the disposal, but do not hide the problem either.
                logBasic("Unable to close file [" + data.file + "] : " + e.toString());
            }
        }
        data.resultList = null;
        super.dispose(smi, sdi);
    }
}