Example usage for org.apache.pdfbox.pdfparser PDFParser getPDDocument

List of usage examples for org.apache.pdfbox.pdfparser PDFParser getPDDocument

Introduction

On this page you can find example usages of org.apache.pdfbox.pdfparser PDFParser getPDDocument.

Prototype

public PDDocument getPDDocument() throws IOException 

Source Link

Document

This will get the PD document that was parsed.

Usage

From source file:com.aurel.track.lucene.index.associatedFields.textExctractor.PdfExtractor.java

License:Open Source License

/**
 * Gets the text from file content /*from  ww  w. ja v a2s.  c  o m*/
 * @param file
 * @param fileExtension
 * @return
 */
@Override
public String getText(File file, String fileExtension) {
    FileInputStream fis = null;
    PDDocument pdDoc = null;
    StringWriter stringWriter = null;
    try {
        fis = new FileInputStream(file);
        PDFParser parser = new PDFParser(fis);
        parser.parse();
        pdDoc = parser.getPDDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.setLineSeparator("\n");
        stringWriter = new StringWriter();
        stripper.writeText(pdDoc, stringWriter);
        return stringWriter.toString();
    } catch (Exception e) {
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug(
                    "Extracting text from the .pdf  file " + file.getName() + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
    } finally {
        try {
            if (stringWriter != null) {
                stringWriter.close();
            }
        } catch (Exception e) {
        }
        try {
            if (pdDoc != null) {
                pdDoc.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing pdDoc for " + file + " failed with " + e.getMessage());
            LOGGER.debug(ExceptionUtils.getStackTrace(e));
        }
        try {
            if (fis != null) {
                fis.close();
            }
        } catch (Exception e) {
            LOGGER.info("Closing the FileInputStream for " + file + " failed with " + e.getMessage());
        }
    }
    return null;
}

From source file:com.exlibris.dps.repository.plugin.riskExtractor.drmlint.PDFBoxWrapper.java

License:Apache License

/**
 * Checks whether a PDF file is valid by parsing it and re-saving the parsed
 * document to a temporary file.
 * <p>
 * The input stream, the parsed document and the temporary file are always
 * released, whether or not the check succeeds.
 *
 * @param pFile file to check
 * @return {@code true} if the file could be parsed, saved and closed without
 *         an error, {@code false} otherwise
 */
public static boolean isValid(File pFile) {
    boolean ret = false;
    FileInputStream in = null;
    // Fully qualified so that this method does not depend on an extra import.
    org.apache.pdfbox.pdmodel.PDDocument document = null;
    File temp = null;
    try {
        in = new FileInputStream(pFile);
        PDFParser parser = new PDFParser(in);
        parser.parse();
        document = parser.getPDDocument();
        temp = File.createTempFile("drmlint-temp-", ".pdf");
        document.save(temp);
        ret = true;
    } catch (IOException e) {
        // Also covers FileNotFoundException for a missing input file.
        e.printStackTrace();
    } catch (COSVisitorException e) {
        // The document could be parsed but not written back: treat as invalid.
        ret = false;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                // A document that cannot be closed cleanly is reported as invalid,
                // matching the behavior of the check before resources were managed here.
                e.printStackTrace();
                ret = false;
            }
        }
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (temp != null) {
            temp.delete();
        }
    }
    return ret;
}

From source file:com.liferay.portal.util.LuceneFields.java

License:Open Source License

/**
 * Builds a Lucene field from the content of a file.
 * <p>
 * For the extensions ".doc", ".htm", ".html", ".pdf", ".rtf" and ".xls" the
 * text is extracted and stored in the field. For any other extension, or when
 * the extraction fails, the field is created on top of a reader over the file.
 *
 * @param field the name of the field to create
 * @param file the file whose content is indexed
 * @param fileExt the file extension including the leading dot; compared
 *        case-insensitively
 * @return a field holding the extracted text, or a reader-backed field if no
 *         text could be extracted
 * @throws IOException if the file cannot be opened
 */
public static Field getFile(String field, File file, String fileExt) throws IOException {

    fileExt = fileExt.toLowerCase();

    FileInputStream fis = new FileInputStream(file);
    Reader reader = new BufferedReader(new InputStreamReader(fis));

    String text = null;

    if (fileExt.equals(".doc")) {
        try {
            WordDocument wordDocument = new WordDocument(fis);

            StringWriter stringWriter = new StringWriter();

            wordDocument.writeAllText(stringWriter);

            text = stringWriter.toString();
        } catch (Exception e) {
            _log.error(e.getMessage(), e);
        }
    } else if (fileExt.equals(".htm") || fileExt.equals(".html")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            HTMLEditorKit htmlEditorKit = new HTMLEditorKit();
            htmlEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage(), e);
        }
    } else if (fileExt.equals(".pdf")) {
        PDDocument pdDoc = null;

        try {
            PDFParser parser = new PDFParser(fis);
            parser.parse();

            pdDoc = parser.getPDDocument();

            StringWriter stringWriter = new StringWriter();

            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(pdDoc, stringWriter);

            text = stringWriter.toString();
        } catch (Exception e) {
            _log.error(e.getMessage(), e);
        } finally {
            // Close the document even when the text extraction itself failed.
            if (pdDoc != null) {
                try {
                    pdDoc.close();
                } catch (Exception e) {
                    _log.error(e.getMessage(), e);
                }
            }
        }
    } else if (fileExt.equals(".rtf")) {
        try {
            DefaultStyledDocument dsd = new DefaultStyledDocument();

            RTFEditorKit rtfEditorKit = new RTFEditorKit();
            rtfEditorKit.read(reader, dsd, 0);

            text = dsd.getText(0, dsd.getLength());
        } catch (Exception e) {
            _log.error(e.getMessage(), e);
        }
    } else if (fileExt.equals(".xls")) {
        try {
            XLSTextStripper stripper = new XLSTextStripper(fis);

            text = stripper.getText();
        } catch (Exception e) {
            _log.error(e.getMessage(), e);
        }
    }

    if (text == null) {
        // The reader is handed over to the field and therefore must stay open.
        // NOTE(review): presumably the indexer closes it after consuming it - confirm.
        return new Field(field, reader);
    }

    // The text has been fully extracted, so the file is no longer needed;
    // closing the reader also closes the underlying FileInputStream.
    try {
        reader.close();
    } catch (IOException e) {
        _log.error(e.getMessage(), e);
    }

    return new Field(field, text, Field.Store.YES, Field.Index.NOT_ANALYZED);
}

From source file:com.openkm.extractor.PdfTextExtractor.java

License:Open Source License

/**
 * {@inheritDoc}/*w  w  w  . j  a v a2s  .  c  o  m*/
 */
@SuppressWarnings("rawtypes")
public String extractText(InputStream stream, String type, String encoding) throws IOException {
    try {
        PDFParser parser = new PDFParser(new BufferedInputStream(stream));

        try {
            parser.parse();
            PDDocument document = parser.getPDDocument();

            if (document.isEncrypted()) {
                try {
                    document.decrypt("");
                    document.setAllSecurityToBeRemoved(true);
                } catch (Exception e) {
                    throw new IOException("Unable to extract text: document encrypted", e);
                }
            }

            CharArrayWriter writer = new CharArrayWriter();
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setLineSeparator("\n");
            stripper.writeText(document, writer);
            String st = writer.toString().trim();
            log.debug("TextStripped: '{}'", st);

            if (Config.SYSTEM_PDF_FORCE_OCR || st.length() <= 1) {
                log.warn("PDF does not contains text layer");

                // Extract images from PDF
                StringBuilder sb = new StringBuilder();

                if (!Config.SYSTEM_PDFIMAGES.isEmpty()) {
                    File tmpPdf = FileUtils.createTempFile("pdf");
                    File tmpDir = new File(EnvironmentDetector.getTempDir());
                    String baseName = FileUtils.getFileName(tmpPdf.getName());
                    document.save(tmpPdf);
                    int pgNum = 1;

                    try {
                        for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                            HashMap<String, Object> hm = new HashMap<String, Object>();
                            hm.put("fileIn", tmpPdf.getPath());
                            hm.put("firstPage", pgNum);
                            hm.put("lastPage", pgNum++);
                            hm.put("imageRoot", tmpDir + File.separator + baseName);
                            String cmd = TemplateUtils.replace("SYSTEM_PDFIMAGES", Config.SYSTEM_PDFIMAGES, hm);
                            ExecutionUtils.runCmd(cmd);

                            for (File tmp : tmpDir.listFiles()) {
                                if (tmp.getName().startsWith(baseName + "-")) {
                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(tmp, tmp, page.findRotation());
                                    }

                                    try {
                                        String txt = doOcr(tmp);
                                        sb.append(txt).append(" ");
                                        log.debug("OCR Extracted: {}", txt);
                                    } finally {
                                        FileUtils.deleteQuietly(tmp);
                                    }
                                }
                            }
                        }
                    } finally {
                        FileUtils.deleteQuietly(tmpPdf);
                    }
                } else {
                    for (PDPage page : (List<PDPage>) document.getDocumentCatalog().getAllPages()) {
                        PDResources resources = page.getResources();
                        Map<String, PDXObject> images = resources.getXObjects();

                        if (images != null) {
                            for (String key : images.keySet()) {
                                PDXObjectImage image = (PDXObjectImage) images.get(key);
                                String prefix = "img-" + key + "-";
                                File pdfImg = null;

                                try {
                                    pdfImg = File.createTempFile(prefix, ".png");
                                    log.debug("Writing image: {}", pdfImg.getPath());

                                    // Won't work until PDFBox 1.8.9
                                    ImageIO.write(image.getRGBImage(), "png", pdfImg);

                                    if (page.findRotation() > 0) {
                                        ImageUtils.rotate(pdfImg, pdfImg, page.findRotation());
                                    }

                                    // Do OCR
                                    String txt = doOcr(pdfImg);
                                    sb.append(txt).append(" ");
                                    log.debug("OCR Extracted: {}", txt);
                                } finally {
                                    FileUtils.deleteQuietly(pdfImg);
                                }
                            }
                        }
                    }
                }

                return sb.toString();
            } else {
                return writer.toString();
            }
        } finally {
            try {
                PDDocument doc = parser.getPDDocument();
                if (doc != null) {
                    doc.close();
                }
            } catch (IOException e) {
                // ignore
            }
        }
    } catch (Exception e) {
        // it may happen that PDFParser throws a runtime
        // exception when parsing certain pdf documents
        log.warn("Failed to extract PDF text content", e);
        throw new IOException(e.getMessage(), e);
    } finally {
        stream.close();
    }
}

From source file:com.stimulus.archiva.extraction.PDFExtractor.java

License:Open Source License

/**
 * Extracts the text of a PDF document into a temporary file and returns a
 * reader over that file.
 * <p>
 * Encrypted documents are decrypted with an empty password first. The
 * temporary file and the returned reader are both registered with
 * {@code indexInfo}.
 *
 * @param is stream delivering the PDF content
 * @param charset not used by this method
 * @param indexInfo receives the temporary file and the returned reader
 * @return a reader over the extracted text
 * @throws ExtractionException if the document cannot be parsed, decrypted or
 *         written, or if the extracted text cannot be read back
 */
public Reader getText(InputStream is, Charset charset, IndexInfo indexInfo) throws ExtractionException {
    logger.debug("extracting pdf file");
    File file = null;
    PDDocument document = null;
    Writer output = null;
    try {
        PDFParser parser = new PDFParser(is);
        parser.parse();
        document = parser.getPDDocument();
        if (document.isEncrypted()) {
            DocumentEncryption decryptor = new DocumentEncryption(document);
            if (logger.isDebugEnabled()) {
                logger.debug("pdf document appears to be encrypted (will attempt decryption)");
            }
            decryptor.decryptDocument("");
        }
        file = File.createTempFile("extract_pdf", ".tmp");
        indexInfo.addDeleteFile(file);
        output = new OutputStreamWriter(new FileOutputStream(file), "UTF-8");
        PDFTextStripper stripper = new PDFTextStripper();
        stripper.writeText(document, output);
    } catch (Throwable e) {
        throw new ExtractionException("failed to extract pdf (probable password protected document)", e, logger,
                ChainedException.Level.DEBUG);
    } finally {
        // Close both resources independently: a failure while closing the
        // document must not leave the output file open.
        if (document != null) {
            try {
                document.close();
            } catch (IOException io) {
                logger.error("failed to close pdf document", io);
            }
        }
        if (output != null) {
            try {
                output.close();
            } catch (IOException io) {
                logger.error("failed to close extracted pdf text file", io);
            }
        }
    }
    try {
        logger.debug("returning extracted PDF data");
        // The text was written as UTF-8, so it must be read back as UTF-8 rather
        // than with the platform default charset. Fully qualified names are used
        // so that no additional imports are required.
        Reader outReader = new java.io.InputStreamReader(new java.io.FileInputStream(file), "UTF-8");
        indexInfo.addReader(outReader);
        return outReader;
    } catch (Exception ex) {
        throw new ExtractionException("failed to read extracted text from pdf document", ex, logger,
                ChainedException.Level.DEBUG);
    }
}

From source file:cz.muni.pdfjbim.PdfImageExtractor.java

License:Apache License

/**
 * @deprecated -- do not use doesn't work properly yet
 * This method extracts images by going through PDF tree structure
 * @param pdfFile name of input PDF file
 * @param prefix /* w  ww . j a v  a  2  s.  c  om*/
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages which should be processed if null given => processed all pages
 *      -- not working yet
//    * @param silent -- if true error messages are not written to output otherwise they are
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String prefix, String password,
        Set<Integer> pagesToProcess, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException("pdfFile must be defined");
    }

    InputStream inputStream = null;
    if (password != null) {
        try {
            log.debug("PDF probably encrypted, trying to decrypt using given password {}", password);
            ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream();
            PdfReader reader = new PdfReader(pdfFile, password.getBytes(StandardCharsets.UTF_8));
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.entrySet().iterator();
                while (xobjIter.hasNext()) {
                    Map.Entry entry = (Map.Entry) xobjIter.next();
                    String key = (String) entry.getKey();
                    PDXObject xobj = (PDXObject) entry.getValue();
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.entrySet().iterator();
                        while (imageIter.hasNext()) {
                            Map.Entry imEntry = (Map.Entry) imageIter.next();
                            String imKey = (String) imEntry.getKey();
                            PDXObjectImage image = (PDXObjectImage) imEntry.getValue();

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List<COSName> filters = pdStr.getFilters();

                            if (image.getBitsPerComponent() > 1 && !binarize) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (filters.contains(COSName.LZW_DECODE)) {
                                log.info("This is LZWDecoded => skipping");
                                continue;

                            }

                            if (filters.contains(COSName.JBIG2_DECODE)) {
                                if (skipJBig2Images) {
                                    log.warn("Allready compressed according to JBIG2 standard => skipping");
                                    continue;
                                } else {
                                    log.debug("JBIG2 image detected");
                                }
                            }

                            // detection of unsupported filters by pdfBox library
                            if (filters.contains(COSName.JPX_DECODE)) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }
                }
            }
        }
    } catch (IOException ex) {
        Tools.deleteFilesFromList(namesOfImages);
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } catch (RuntimeException ex) {
        Tools.deleteFilesFromList(namesOfImages);
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

From source file:cz.muni.pdfjbim.PdfImageProcessor.java

License:Apache License

/**
 * Extracts images by going through the PDF tree structure and writes each
 * accepted image to a file whose name starts with the name of the PDF file
 * without its last four characters.
 * <p>
 * Images are skipped when they are not bitonal (unless {@code binarize} is
 * set) or when they use the LZWDecode, JBIG2Decode or JPXDecode filter.
 *
 * @deprecated do not use, doesn't work properly yet
 * @param pdfFile name of input PDF file
 * @param password password for access to PDF if needed
 * @param pagesToProcess list of pages (1-based) which should be processed; if
 *      {@code null}, all pages are processed -- not working yet
 * @param silent -- if true error messages are not written to output otherwise they are
 *      (currently not evaluated by this method)
 * @param binarize -- enables processing of nonbitonal images as well (LZW is still not
 *      processed because of output with inverted colors)
 * @throws PdfRecompressionException if problem to extract images from PDF
 */
public void extractImagesUsingPdfObjectAccess(String pdfFile, String password, Set<Integer> pagesToProcess,
        Boolean silent, Boolean binarize) throws PdfRecompressionException {
    if (binarize == null) {
        binarize = false;
    }
    // checking arguments and setting appropriate variables
    if (pdfFile == null) {
        throw new IllegalArgumentException("pdfFile must be defined");
    }

    String prefix = null;

    InputStream inputStream = null;
    if (password != null) {
        try {
            // The stamper writes the decrypted copy into this buffer, so it must
            // be a real stream (it used to be null, which always failed).
            ByteArrayOutputStream decryptedOutputStream = new ByteArrayOutputStream();
            PdfReader reader = new PdfReader(pdfFile, password.getBytes());
            PdfStamper stamper = new PdfStamper(reader, decryptedOutputStream);
            stamper.close();
            inputStream = new ByteArrayInputStream(decryptedOutputStream.toByteArray());
        } catch (DocumentException ex) {
            throw new PdfRecompressionException(ex);
        } catch (IOException ex) {
            throw new PdfRecompressionException("Reading file caused exception", ex);
        }
    } else {
        try {
            inputStream = new FileInputStream(pdfFile);
        } catch (FileNotFoundException ex) {
            throw new PdfRecompressionException("File wasn't found", ex);
        }
    }

    // if prefix is not set then prefix set to name of pdf without .pdf
    // if pdfFile has unconsistent name (without suffix .pdf) and name longer than 4 chars then last for chars are removed
    // and this string set as prefix
    if ((prefix == null) && (pdfFile.length() > 4)) {
        prefix = pdfFile.substring(0, pdfFile.length() - 4);
    }

    PDFParser parser = null;
    PDDocument doc = null;
    try {
        parser = new PDFParser(inputStream);
        parser.parse();
        doc = parser.getPDDocument();

        AccessPermission accessPermissions = doc.getCurrentAccessPermission();

        if (!accessPermissions.canExtractContent()) {
            throw new PdfRecompressionException("Error: You do not have permission to extract images.");
        }

        // going page by page
        List pages = doc.getDocumentCatalog().getAllPages();
        for (int pageNumber = 0; pageNumber < pages.size(); pageNumber++) {
            if ((pagesToProcess != null) && (!pagesToProcess.contains(pageNumber + 1))) {
                continue;
            }
            PDPage page = (PDPage) pages.get(pageNumber);
            PDResources resources = page.getResources();
            Map xobjs = resources.getXObjects();

            if (xobjs != null) {
                Iterator xobjIter = xobjs.keySet().iterator();
                while (xobjIter.hasNext()) {
                    String key = (String) xobjIter.next();
                    PDXObject xobj = (PDXObject) xobjs.get(key);
                    Map images;
                    if (xobj instanceof PDXObjectForm) {
                        PDXObjectForm xform = (PDXObjectForm) xobj;
                        images = xform.getResources().getImages();
                    } else {
                        images = resources.getImages();
                    }

                    // reading images from each page and saving them to file
                    if (images != null) {
                        Iterator imageIter = images.keySet().iterator();
                        while (imageIter.hasNext()) {
                            String imKey = (String) imageIter.next();
                            PDXObjectImage image = (PDXObjectImage) images.get(imKey);

                            PDStream pdStr = new PDStream(image.getCOSStream());
                            List filters = pdStr.getFilters();

                            // non-bitonal images are only processed when binarize is requested
                            if (image.getBitsPerComponent() > 1 && !binarize) {
                                log.info("It is not a bitonal image => skipping");
                                continue;
                            }

                            // at this moment for preventing bad output (bad coloring) from LZWDecode filter
                            if (hasFilter(filters, COSName.LZW_DECODE.getName())) {
                                log.info("This is LZWDecoded => skipping");
                                continue;
                            }

                            // detection of unsupported filters by pdfBox library
                            if (hasFilter(filters, "JBIG2Decode")) {
                                log.info("Allready compressed according to JBIG2 standard => skipping");
                                continue;
                            }
                            if (hasFilter(filters, "JPXDecode")) {
                                log.info("Unsupported filter JPXDecode => skipping");
                                continue;
                            }

                            COSObject cosObj = new COSObject(image.getCOSObject());
                            int objectNum = cosObj.getObjectNumber().intValue();
                            int genNum = cosObj.getGenerationNumber().intValue();
                            log.debug(objectNum + " " + genNum + " obj");

                            String name = getUniqueFileName(prefix + imKey, image.getSuffix());
                            log.debug("Writing image:" + name);
                            image.write2file(name);

                            PdfImageInformation pdfImageInfo = new PdfImageInformation(key, image.getWidth(),
                                    image.getHeight(), objectNum, genNum);
                            originalImageInformations.add(pdfImageInfo);
                            log.debug(pdfImageInfo.toString());

                            namesOfImages.add(name + "." + image.getSuffix());
                        }
                    }

                }
            }

        }
    } catch (IOException ex) {
        throw new PdfRecompressionException("Unable to parse PDF document", ex);
    } finally {
        try {
            inputStream.close();
        } catch (IOException ex) {
            log.info("Unable to close input of " + pdfFile + ": " + ex.getMessage());
        }
        if (doc != null) {
            try {
                doc.close();
            } catch (IOException ex) {
                throw new PdfRecompressionException(ex);
            }
        }
    }
}

/**
 * Tells whether a stream filter list contains the filter with the given name.
 * The list is matched against both the plain name and its {@code COSName}
 * form, and a {@code null} list is treated as containing no filters.
 */
private static boolean hasFilter(List filters, String filterName) {
    if (filters == null) {
        return false;
    }
    return filters.contains(filterName) || filters.contains(COSName.getPDFName(filterName));
}

From source file:de.csw.linkgenerator.plugin.lucene.textextraction.PDFTextExtractor.java

License:Apache License

/**
 * Extracts the text of a PDF document that is held in memory.
 *
 * @param data the raw bytes of the PDF document
 * @return the text written by the PDF text stripper
 * @throws Exception if the document cannot be parsed, stripped or closed
 */
public String getText(byte[] data) throws Exception {
    PDDocument parsed = null;
    try {
        PDFParser pdfParser = new PDFParser(new ByteArrayInputStream(data));
        pdfParser.parse();
        parsed = pdfParser.getPDDocument();

        // Collect the stripped text in memory and hand it back as a string.
        Writer sink = new CharArrayWriter();
        new PDFTextStripper().writeText(parsed, sink);
        return sink.toString();
    } finally {
        // Release the document even if stripping failed.
        if (parsed != null) {
            parsed.close();
        }
    }
}

From source file:de.kp.ames.nlp.PdfEngine.java

License:Open Source License

/**
 * Extracts the distinct non-stopword terms of a PDF document.
 * <p>
 * The text of the document is split into lines and each line into words
 * separated by a single space; empty words and stopwords are dropped. The
 * parsed document is closed before returning; the given stream is left to
 * the caller.
 *
 * @param stream stream delivering the PDF content
 * @return the set of terms, or {@code null} if the document could not be
 *         processed
 */
public Set<String> pdfToText(InputStream stream) {

    // Fully qualified so that this method does not depend on an extra import.
    org.apache.pdfbox.pdmodel.PDDocument document = null;

    try {

        ENStopwords stopwords = new ENStopwords();

        PDFParser parser = new PDFParser(stream);
        parser.parse();
        document = parser.getPDDocument();

        /* 
         * Build pdf stripper and extract text content
         */
        PDFTextStripper stripper = new PDFTextStripper();
        String text = stripper.getText(document);

        Set<String> terms = new HashSet<String>();

        for (String phrase : text.split("\n")) {

            phrase = phrase.trim();
            if (phrase.length() == 0) {
                continue;
            }

            for (String word : phrase.split(" ")) {

                /* 
                 * Filter empty words (caused by consecutive spaces) and stopwords
                 */
                if (word.length() == 0 || stopwords.isStopword(word)) {
                    continue;
                }
                terms.add(word);

            }

        }

        return terms;

    } catch (Exception e) {
        // Best effort: failures are reported to the caller as null.
        e.printStackTrace();

    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    return null;
}

From source file:IO.search.SearchWordFile.java

/**
 * Checks whether {@code scrFile} satisfies the search expression {@code word} and, if it
 * does, appends the file's path to {@code nameList}.
 *
 * <p>The expression is interpreted as follows:
 * <ul>
 *   <li>it contains a space: it is split on spaces and the file matches when it
 *       contains <em>any</em> of the resulting terms;</li>
 *   <li>otherwise, it contains a hyphen: it is split on hyphens and the file matches
 *       when it contains <em>all</em> of the resulting terms;</li>
 *   <li>otherwise the file matches when it contains the expression itself.</li>
 * </ul>
 * All comparisons are case-insensitive. Files ending in {@code .doc}, {@code .docx} and
 * {@code .pdf} are matched against their whole extracted text. Files ending in
 * {@code .txt} are matched line by line and scanning stops at the first matching line,
 * so a path is added at most once per call. Files with any other extension are ignored.
 *
 * <p>Exceptions raised while opening, parsing or closing the file are logged at
 * {@code SEVERE} level and not rethrown.
 *
 * @param scrFile the file to inspect
 * @param word the search expression
 */
private void search(File scrFile, String word) {
    String[] anyTerms = null;
    String[] allTerms = null;
    if (word.contains(" ")) {
        anyTerms = word.split(" ");
    } else if (word.contains("-")) {
        allTerms = word.split("-");
        System.out.println("reach");
    }

    String fileName = scrFile.getName();
    if (fileName.matches("^.+\\.(?i)(doc)$")) {
        InputStream is = null;
        try {
            is = new FileInputStream(scrFile);
            WordExtractor extractor = new WordExtractor(is);
            if (matchesQuery(extractor.getText(), anyTerms, allTerms, word)) {
                nameList.add(scrFile.getPath());
            }
        } catch (Exception ex) {
            Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            closeAndLogFailure(is);
        }

    } else if (fileName.matches("^.+\\.(?i)(docx)$")) {
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(scrFile.getPath());
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            if (matchesQuery(extractor.getText(), anyTerms, allTerms, word)) {
                nameList.add(scrFile.getPath());
            }
        } catch (Exception ex) {
            Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            if (opcPackage != null) {
                // revert() releases the file without writing the package back to disk;
                // close() would save it, which a read-only search must not do.
                opcPackage.revert();
            }
        }

    } else if (fileName.matches("^.+\\.(?i)(pdf)$")) {
        FileInputStream input = null;
        PDDocument pdfDocument = null;
        try {
            input = new FileInputStream(scrFile);
            PDFParser pdfParser = new PDFParser(input);
            pdfParser.parse();
            pdfDocument = pdfParser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            if (matchesQuery(stripper.getText(pdfDocument), anyTerms, allTerms, word)) {
                nameList.add(scrFile.getPath());
            }
        } catch (Exception ex) {
            Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // Either resource may be null when opening or parsing failed, and a failure
            // to close one must not prevent closing the other.
            closeAndLogFailure(input);
            if (pdfDocument != null) {
                try {
                    pdfDocument.close();
                } catch (IOException ex) {
                    Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
                }
            }
        }

    } else if (fileName.matches("^.+\\.(?i)(txt)$")) {
        FileInputStream rawInput = null;
        BufferedReader in = null;
        try {
            rawInput = new FileInputStream(scrFile);
            in = new BufferedReader(
                    new InputStreamReader(rawInput, getCharset(scrFile.getAbsolutePath())));
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
                if (matchesQuery(line, anyTerms, allTerms, word)) {
                    nameList.add(scrFile.getPath());
                    return;
                }
            }
        } catch (Exception ex) {
            Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            // Closing the reader also closes the underlying stream; fall back to the raw
            // stream when the reader could not be constructed.
            if (in != null) {
                closeAndLogFailure(in);
            } else {
                closeAndLogFailure(rawInput);
            }
        }
    }
}

/**
 * Tells whether {@code text} satisfies the parsed search expression, ignoring case.
 *
 * @param text the text to inspect
 * @param anyTerms terms of which at least one must occur; ignored when null or empty
 * @param allTerms terms that must all occur; consulted only when {@code anyTerms} is
 *        null or empty, and itself ignored when null or empty
 * @param word the unsplit expression, used when neither term array applies
 * @return {@code true} if the text matches
 */
private boolean matchesQuery(String text, String[] anyTerms, String[] allTerms, String word) {
    String haystack = text.toLowerCase();
    if (anyTerms != null && anyTerms.length > 0) {
        for (String term : anyTerms) {
            if (haystack.contains(term.toLowerCase())) {
                return true;
            }
        }
        return false;
    }
    if (allTerms != null && allTerms.length > 0) {
        for (String term : allTerms) {
            if (!haystack.contains(term.toLowerCase())) {
                return false;
            }
        }
        return true;
    }
    if (haystack.contains(word.toLowerCase())) {
        System.out.println("true");
        return true;
    }
    return false;
}

/**
 * Closes {@code resource} if it is non-null, logging (rather than propagating) any
 * {@link IOException} raised by the close.
 *
 * @param resource the resource to close; may be {@code null}
 */
private void closeAndLogFailure(java.io.Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException ex) {
        Logger.getLogger(SearchWordFile.class.getName()).log(Level.SEVERE, null, ex);
    }
}