com.mycompany.textextract.textExtract.java Source code

Java tutorial

Introduction

Here is the source code for com.mycompany.textextract.textExtract.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.mycompany.textextract;

import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Extracts the text of every regular file in a source folder (expected to hold
 * PDF documents) and writes it as a UTF-8 text file into a target folder.
 *
 * <p>For a source file named {@code name}, the output is written to
 * {@code <target>/name.txt}. If an entry with that name already exists in the
 * target folder, the source file is skipped.
 *
 * @author sschick
 */
public class textExtract {
    /**
     * Command-line entry point.
     *
     * <p>A failure while processing a single file is reported on standard
     * output and does not stop the remaining files from being processed.
     *
     * @param args {@code args[0]} is the source folder, {@code args[1]} is the
     *             target folder
     * @throws IllegalArgumentException if fewer than two arguments are given
     * @throws IOException if the source folder cannot be listed or the target
     *                     is not a directory
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException("usage: textExtract <pdf-folder> <target-folder>");
        }
        File folder = new File(args[0]);
        File target = new File(args[1]);

        // File.list() returns null (not an empty array) when the path is not a
        // readable directory, so it must be checked before iterating.
        String[] files = folder.list();
        if (files == null) {
            throw new IOException("cannot list source folder: " + folder.getAbsolutePath());
        }
        if (!target.isDirectory()) {
            throw new IOException("target is not a directory: " + target.getAbsolutePath());
        }

        for (String filename : files) {
            // list() yields bare names; resolve them against the source folder,
            // otherwise isFile() would be evaluated relative to the working directory.
            File source = new File(folder, filename);
            File output = new File(target, filename + ".txt");
            if (!source.isFile() || output.exists()) {
                continue;
            }
            try {
                // Extract first, so a failed extraction does not leave behind an
                // empty output file that would cause this source to be skipped
                // on the next run.
                String text;
                PDDocument doc = PDDocument.load(source.getAbsolutePath());
                try {
                    text = new PDFTextStripper().getText(doc);
                } finally {
                    doc.close();
                }
                try (PrintWriter writer = new PrintWriter(output, "UTF-8")) {
                    writer.print(text);
                    // PrintWriter never throws on write errors; it only records them.
                    if (writer.checkError()) {
                        throw new IOException("could not write " + output.getAbsolutePath());
                    }
                }
            } catch (Exception e) {
                System.out.println("writing failed-" + e.toString());
            }
        }
    }
}