Example usage for com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage

List of usage examples for com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage

Introduction

On this page you can find example usages of com.itextpdf.text.pdf.parser PdfTextExtractor getTextFromPage.

Prototype

public static String getTextFromPage(PdfReader reader, int pageNumber) throws IOException 

Source Link

Document

Extract text from a specified page using the default strategy.

Usage

From source file:englishrusbook.Reader.java

/**
 * Extracts the text of every page of {@code reader} and returns it keyed by
 * page number (starting at 1).
 *
 * <p>A page whose extraction throws {@link IOException} is reported on
 * standard output and left out of the returned map.
 *
 * @return map from page number to the text extracted from that page
 */
public HashMap<Integer, String> getPageMap() {
    debugOut("function:getPDFPages");

    final HashMap<Integer, String> textByPage = new HashMap<>();
    final int lastPage = reader.getNumberOfPages();

    int pageNo = 1;
    while (pageNo <= lastPage) {
        try {
            String pageText = PdfTextExtractor.getTextFromPage(reader, pageNo);
            textByPage.put(pageNo, pageText);
            debugOut("ShowPageOfNumber: " + pageNo);
        } catch (IOException e) {
            System.out.println("ERROR in reading: " + e);
        }
        pageNo++;
    }

    return textByPage;
}

From source file:example.Cap1BackupCode.java

/**
 * Reads the CAP round 1 cut-off PDF, collects college/branch/rank data from
 * fixed line positions on each page, and inserts one document per college
 * into the {@code project} collection of the {@code CollegeFinder} database.
 *
 * <p>Line positions used on each page (1-based): 7 = college, 8 = branch,
 * 11 = open, 14 = SC, 17 = ST, 20 = VJ and/or NT1, 26 = NT2, 29 = NT3 and/or OBC.
 * A page whose line 8 mentions NT1, NT2, NT3 or OBC is abandoned at that line.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
public static void main(String[] args) throws IOException {
    String line;
    // Last line seen; deliberately carried across pages, as in the original flow.
    String prev = "";

    Example example = new Example();
    PdfReader reader = new PdfReader("/home/sachin/Downloads/2014ENGG_Cutoff_CAP1.pdf");
    try {
        int pageCount = reader.getNumberOfPages();
        // NOTE(review): page numbers start at 1, so this bound never visits the
        // last page. Kept as-is; confirm whether skipping it is intentional.
        for (int i = 1; i < pageCount; i += 1) {
            String page = PdfTextExtractor.getTextFromPage(reader, i);
            // Read the text directly instead of round-tripping it through the
            // platform charset, which can corrupt unmappable characters.
            try (BufferedReader br = new BufferedReader(new java.io.StringReader(page))) {
                int count = 1;
                while ((line = br.readLine()) != null) {
                    if (count == 7) {
                        String college = line.substring(6).trim();
                        if (!college.equals(example.getPrevCollege())) {
                            // Compare by value; "!=" only compared references.
                            if (!"".equals(example.getPrevCollege())) {
                                example.addObject(example.getSinglecollegedata());
                            }
                            example.setPrevCollege(college);
                            CollegeDataObject object = new CollegeDataObject();
                            object.setCollegeName(college);
                            String[] parts = college.split(",");
                            if (parts.length > 1) {
                                // The text after the last comma is taken as the city.
                                String city = parts[parts.length - 1].trim();
                                object.setCity(city);
                                object.setCollegeName(
                                        college.replace(city, "").trim().replace(",", "").trim());
                            }
                            example.setSinglecollegedata(object);
                        }
                    }
                    if (count == 8) {
                        if (line.contains("NT1") || line.contains("NT2") || line.contains("NT3")
                                || line.contains("OBC")) {
                            break;
                        }
                        if (!"".equals(example.getPrevBranch())) {
                            // Close the previous branch before starting a new one.
                            example.getSinglecollegedata().AddObject(example.getCastDataObject());
                        }
                        String branch = line.substring(11).trim();
                        CastDataObject object = new CastDataObject();
                        object.setBranchName(branch);
                        example.setCastDataObject(object);
                        example.setPrevBranch(branch);
                    }
                    if (count == 11) {
                        String open = line.split(" ")[0];
                        if (isNumeric(open)) {
                            example.getCastDataObject().setOpen(open.trim());
                        }
                    }
                    if (count == 14 && isNumeric(line)) {
                        example.getCastDataObject().setSc(line.trim());
                    }
                    if (count == 17 && isNumeric(line)) {
                        example.getCastDataObject().setSt(line.trim());
                    }
                    if (count == 20 && isNumeric(line)) {
                        if ("GVJO".equals(prev)) {
                            example.getCastDataObject().setVj(line.trim());
                            line = readAhead(br, 3);
                            if (line == null) {
                                break; // page ended before the NT1 rank
                            }
                            count += 3;
                            example.getCastDataObject().setNt1(line.trim());
                        } else {
                            example.getCastDataObject().setNt1(line.trim());
                            // No VJ row here: advance the position without reading.
                            count += 3;
                        }
                    }
                    if (count == 26 && isNumeric(line)) {
                        example.getCastDataObject().setNt2(line.trim());
                    }
                    if (count == 29) {
                        if ("GNT3O".equals(prev) && isNumeric(line)) {
                            example.getCastDataObject().setNt3(line.trim());
                            line = readAhead(br, 3);
                            if (line == null) {
                                break; // page ended before the OBC rank
                            }
                        }
                        String obc = line.split(" ")[0];
                        if (isNumeric(obc)) {
                            example.getCastDataObject().setOBC(obc.trim());
                        }
                    }
                    count++;
                    prev = line;
                }
            }
        }
    } finally {
        reader.close();
    }

    // Flush the branch and college that were still being built when parsing ended.
    if (example.getSinglecollegedata() != null) {
        example.getSinglecollegedata().AddObject(example.getCastDataObject());
        example.addObject(example.getSinglecollegedata());
    }

    MongoClient mongo1 = new MongoClient("localhost");
    try {
        MongoDatabase db = mongo1.getDatabase("CollegeFinder");
        MongoCollection<Document> coll = db.getCollection("project");
        for (CollegeDataObject collegeDataObject : example.getObject()) {
            Document college = new Document();
            college.append("college_name", collegeDataObject.getCollegeName());
            college.append("city", collegeDataObject.getCity());
            List<Document> branches = new ArrayList<>();
            for (CastDataObject object : collegeDataObject.getObjects()) {
                Document branch = new Document();
                branch.append("branch_name", object.getBranchName());
                branch.append("open", object.getOpen());
                branch.append("sc", object.getSc());
                branch.append("st", object.getSt());
                branch.append("vj", object.getVj());
                branch.append("nt1", object.getNt1());
                branch.append("nt2", object.getNt2());
                branch.append("nt3", object.getNt3());
                branch.append("obc", object.getOBC());
                branches.add(branch);
            }
            college.append("Branch", branches);
            coll.insertOne(college);
        }
    } finally {
        mongo1.close();
    }
}

/**
 * Reads {@code n} lines from {@code br} and returns the last one read, or
 * {@code null} if the input ends before {@code n} lines are available.
 */
private static String readAhead(BufferedReader br, int n) throws IOException {
    String last = null;
    for (int k = 0; k < n; k++) {
        last = br.readLine();
        if (last == null) {
            return null;
        }
    }
    return last;
}

From source file:example.Example.java

/**
 * Reads the CAP round 2 cut-off PDF, collects college/branch/rank data from
 * each page, and inserts one document per college into the
 * {@code cap_round2} collection of the {@code CollegeFinder} database.
 *
 * <p>On each page line 7 is read as the college and line 8 as the branch; from
 * line 10 onwards ranks are located by category tag (for example
 * {@code GSCO}/{@code GSCH}) rather than by line position. A page whose line 8
 * mentions NT1, NT2, NT3 or OBC is abandoned at that line.
 *
 * @param args unused
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
public static void main(String[] args) throws IOException {
    String line;

    Example example = new Example();
    PdfReader reader = new PdfReader("/home/sachin/Downloads/2014ENGG_Cutoff_CAP2.pdf");
    try {
        int pageCount = reader.getNumberOfPages();
        // NOTE(review): page numbers start at 1, so this bound never visits the
        // last page. Kept as-is; confirm whether skipping it is intentional.
        for (int i = 1; i < pageCount; i += 1) {
            String page = PdfTextExtractor.getTextFromPage(reader, i);
            // Read the text directly instead of round-tripping it through the
            // platform charset, which can corrupt unmappable characters.
            try (BufferedReader br = new BufferedReader(new java.io.StringReader(page))) {
                int count = 1;
                while ((line = br.readLine()) != null) {
                    if (count == 7) {
                        String college = line.substring(6).trim();
                        if (!college.equals(example.getPrevCollege())) {
                            // Compare by value; "!=" only compared references.
                            if (!"".equals(example.getPrevCollege())) {
                                example.addObject(example.getSinglecollegedata());
                            }
                            example.setPrevCollege(college);
                            CollegeDataObject object = new CollegeDataObject();
                            object.setCollegeName(college);
                            String[] parts = college.split(",");
                            if (parts.length > 1) {
                                // The text after the last comma is taken as the city.
                                String city = parts[parts.length - 1].trim();
                                object.setCity(city);
                                object.setCollegeName(
                                        college.replace(city, "").trim().replace(",", "").trim());
                            }
                            example.setSinglecollegedata(object);
                        }
                    }
                    if (count == 8) {
                        if (line.contains("NT1") || line.contains("NT2") || line.contains("NT3")
                                || line.contains("OBC")) {
                            break;
                        }
                        if (!"".equals(example.getPrevBranch())) {
                            // Close the previous branch before starting a new one.
                            example.getSinglecollegedata().AddObject(example.getCastDataObject());
                        }
                        String branch = line.substring(11).trim();
                        CastDataObject object = new CastDataObject();
                        object.setBranchName(branch);
                        example.setCastDataObject(object);
                        example.setPrevBranch(branch);
                    }
                    if (count == 10) {
                        collectTaggedRanks(br, line, example);
                        break; // the helper consumed the rest of this page
                    }
                    count++;
                }
            }
        }
    } finally {
        reader.close();
    }

    // Flush the branch and college that were still being built when parsing ended.
    if (example.getSinglecollegedata() != null) {
        example.getSinglecollegedata().AddObject(example.getCastDataObject());
        example.addObject(example.getSinglecollegedata());
    }

    MongoClient mongo1 = new MongoClient("localhost");
    try {
        MongoDatabase db = mongo1.getDatabase("CollegeFinder");
        MongoCollection<Document> coll = db.getCollection("cap_round2");
        for (CollegeDataObject collegeDataObject : example.getObject()) {
            Document college = new Document();
            college.append("college_name", collegeDataObject.getCollegeName());
            college.append("city", collegeDataObject.getCity());
            System.out.println(collegeDataObject.getCollegeName() + " City " + collegeDataObject.getCity());
            List<Document> branches = new ArrayList<>();
            for (CastDataObject object : collegeDataObject.getObjects()) {
                Document branch = new Document();
                branch.append("branch_name", object.getBranchName());
                branch.append("open", rankOrNull(object.getOpen()));
                branch.append("sc", rankOrNull(object.getSc()));
                branch.append("st", rankOrNull(object.getSt()));
                branch.append("vj", rankOrNull(object.getVj()));
                branch.append("nt1", rankOrNull(object.getNt1()));
                branch.append("nt2", rankOrNull(object.getNt2()));
                branch.append("nt3", rankOrNull(object.getNt3()));
                branch.append("obc", rankOrNull(object.getOBC()));
                branches.add(branch);
            }
            college.append("Branch", branches);
            coll.insertOne(college);
        }
    } finally {
        mongo1.close();
    }
}

/**
 * Handles the open-category tag on {@code line}, then reads every remaining
 * line from {@code br} and records the rank for each category tag found.
 *
 * <p>Categories are tested in a fixed order on each line. When a rank sits on
 * the line after its tag, that following line is then tested against the
 * categories that come later in the order, exactly as the sequential checks
 * in the original code did.
 */
private static void collectTaggedRanks(BufferedReader br, String line, Example example)
        throws IOException {
    if (applyCategory(br, line, "OPEN", example) == null) {
        return;
    }
    final String[] categories = { "SC", "ST", "VJ", "NT1", "NT2", "NT3", "OBC" };
    while ((line = br.readLine()) != null) {
        for (String category : categories) {
            line = applyCategory(br, line, category, example);
            if (line == null) {
                return; // page ended right after a tag
            }
        }
    }
}

/**
 * If {@code line} contains the tag {@code G<category>O} or {@code G<category>H},
 * stores the accompanying rank on the current cast data object when it is numeric.
 *
 * <p>The rank is the second space-separated token of the tagged line; if the
 * tagged line has a single token, the next line is read and its second token
 * (or the whole trimmed line when it has one token) is used instead.
 *
 * @return the line on which parsing ended (the input line when no tag
 *         matched), or {@code null} if the input ended while looking for the rank
 */
private static String applyCategory(BufferedReader br, String line, String category, Example example)
        throws IOException {
    if (!line.contains("G" + category + "O") && !line.contains("G" + category + "H")) {
        return line;
    }
    String rankLine = line.split(" ").length > 1 ? line : br.readLine();
    if (rankLine == null) {
        return null;
    }
    String[] tokens = rankLine.split(" ");
    String candidate = tokens.length > 1 ? tokens[1].trim() : rankLine.trim();
    if (isNumeric(candidate)) {
        CastDataObject cast = example.getCastDataObject();
        switch (category) {
        case "OPEN":
            cast.setOpen(candidate);
            break;
        case "SC":
            cast.setSc(candidate);
            break;
        case "ST":
            cast.setSt(candidate);
            break;
        case "VJ":
            cast.setVj(candidate);
            break;
        case "NT1":
            cast.setNt1(candidate);
            break;
        case "NT2":
            cast.setNt2(candidate);
            break;
        case "NT3":
            cast.setNt3(candidate);
            break;
        case "OBC":
            cast.setOBC(candidate);
            break;
        default:
            throw new IllegalArgumentException("Unknown category: " + category);
        }
    }
    return rankLine;
}

/**
 * Converts a stored rank to an {@link Integer}, returning {@code null} when the
 * value is absent or is not a plain integer, so that one unparsable rank does
 * not abort the import after some documents have already been inserted.
 * NOTE(review): confirm that storing null is the desired fallback for a missing rank.
 */
private static Integer rankOrNull(String rank) {
    if (rank == null) {
        return null;
    }
    try {
        return Integer.valueOf(rank.trim());
    } catch (NumberFormatException ignored) {
        // Deliberately mapped to null; see the method comment.
        return null;
    }
}

From source file:freemind.modes.ControllerAdapter.java

License:Open Source License

/**
 * Builds the controller's slide list from a PDF and renders every page of that PDF to a JPG file.
 *
 * <p>The images are written into a directory that is named after the PDF (file name without
 * {@code ".pdf"}) and created next to it. Slide titles are taken either from a page containing
 * the marker {@code "<<table of contents>>"} or, when no such page exists, from the first text
 * line of each page.
 *
 * @param filePath full path of the PDF; must contain {@code fileName}
 * @param fileName file name of the PDF; must contain {@code ".pdf"}
 * @throws IOException if the PDF cannot be opened, mapped or its text cannot be extracted
 */
public void pdf2img(String filePath, String fileName) throws IOException {
    PdfReader reader = new PdfReader(filePath);
    int page = reader.getNumberOfPages();
    String tmp[];
    String temp[];
    ArrayList<SlideData> slideList;
    String newLine[];
    slideList = getController().getSlideList();
    SlideData sData = null;
    String tmpStr = "";
    String data = "";
    String oldStr = "";
    String mkDirPath;
    int noTitle = 0;
    int imgNum = 0;
    int tmpNum = 0;
    boolean dupChk;
    boolean noTitleChk;
    SlideData prev = null;

    // Output directory = directory part of filePath + file name without the ".pdf" extension.
    mkDirPath = filePath.substring(0, filePath.indexOf(fileName.toString()));
    mkDirPath = mkDirPath + fileName.substring(0, fileName.indexOf(".pdf"));
    File mkDirFile = new File(mkDirPath);

    if (!mkDirFile.exists())
        mkDirFile.mkdir();

    // NOTE(review): a literal backslash is used as separator, which looks Windows-specific.
    mkDirPath += "\\";

    File file = new File(filePath);

    // Map the whole PDF read-only; the buffer feeds PDFFile in the rendering loop below.
    // NOTE(review): raf is not closed anywhere in this method.
    RandomAccessFile raf = new RandomAccessFile(file, "r");
    FileChannel channel = raf.getChannel();
    ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());

    // Pass 1: look for a table-of-contents page and, if found, build the slide list from it.
    for (int i = 1; i <= page; i++) {
        String str = PdfTextExtractor.getTextFromPage(reader, i);
        System.out.flush();

        // The first line of page 1 becomes the first slide and carries the image directory.
        if (i == 1) {
            newLine = str.split("\n");
            sData = new SlideData();

            sData.setNodeName(newLine[0]);
            sData.setImgPath(mkDirPath);
            slideList.add(sData);
            prev = sData;
        }

        if (str.indexOf("<<table of contents>>") >= 0) {
            // templateChk is not declared in this method; presumably a field of the enclosing class.
            templateChk = true;

            newLine = str.split("\n");
            // The first and the last line of the contents page are skipped.
            for (int j = 1; j < newLine.length - 1; j++) {
                data = "";
                sData = new SlideData();

                // First token is the dotted heading index (e.g. "1.2"), the rest is the title.
                tmp = newLine[j].split(" ");
                temp = tmp[0].split("\\.");

                for (int l = 0; l < temp.length; l++) // every number of the dotted index goes into the idx list
                    sData.getIdxList().add(Integer.parseInt(temp[l]));

                // Re-join the title tokens with single spaces; empty tokens add no extra space.
                for (int k = 1; k < tmp.length; k++) {
                    data += tmp[k];
                    if (k != tmp.length - 1 && !tmp[k].equals(""))
                        data += " ";
                }
                // Chain the new entry behind the previous one and append it to the slide list.
                sData.setImgPath(prev.getImgPath());
                sData.setNodeName(data);
                sData.setPrev(prev);
                prev.setNext(sData);
                prev = prev.getNext();
                slideList.add(sData);
            }
            // Only the first contents page is used.
            break;
        }
    }

    // Pass 2: count how many pages belong to each slide title (stored as imgCnt).
    if (!templateChk) {
        slideList.clear(); // no contents page: discard pass 1 and rebuild from the page titles
        for (int i = 1; i <= page; i++) {
            String str = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.flush();

            if (i == 1) {
                newLine = str.split("\n");
                sData = new SlideData();

                sData.setNodeName(newLine[0]);
                sData.setImgPath(mkDirPath);
                slideList.add(sData);
            } else {
                newLine = str.split("\n");
                dupChk = false;
                for (int j = 0; j < slideList.size(); j++) {
                    sData = slideList.get(j);
                    if (newLine[0].equals(sData.getNodeName())) { // a slide with this title already exists
                        dupChk = true;
                        break;
                    }
                }
                if (!dupChk) {
                    sData = new SlideData(); // new title: add a new slide entry
                    sData.setNodeName(newLine[0]);
                    slideList.add(sData);
                }
            }
        }

        // Increment imgCnt of the first slide whose title equals the page's first line (spaces ignored).
        for (int i = 1; i <= page; i++) {
            String str = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.flush();
            data = "";
            newLine = str.split("\n");

            data = newLine[0].replace(" ", "");

            for (int j = 0; j < slideList.size(); j++) {
                sData = slideList.get(j);
                tmpStr = sData.getNodeName().replace(" ", "");
                if (data.equals(tmpStr)) {
                    sData.setImgCnt(sData.getImgCnt() + 1);
                    break;
                }
            }
        }
    } else { // contents page found: keep the list from pass 1 and only count pages per title

        for (int i = 1; i <= page; i++) {
            String str = PdfTextExtractor.getTextFromPage(reader, i);
            System.out.flush();
            data = "";
            newLine = str.split("\n");

            tmp = newLine[0].split(" ");

            // Concatenating the space-separated tokens yields the first line without spaces.
            for (int k = 0; k < tmp.length; k++)
                data += tmp[k];

            data = data.replace(" ", "");

            for (int j = 0; j < slideList.size(); j++) {
                sData = slideList.get(j);
                tmpStr = sData.getNodeName().replace(" ", "");
                if (data.equals(tmpStr)) {
                    sData.setImgCnt(sData.getImgCnt() + 1);
                    break;
                }
            }
        }
    }

    // Pass 3: render every page to a JPG inside mkDirPath.
    // imgNum, tmpNum, oldStr and sData deliberately keep their values between iterations.
    for (int i = 1; i <= page; i++) {
        data = "";
        noTitleChk = false;
        PDFFile pdffile = new PDFFile(buf);
        String str = PdfTextExtractor.getTextFromPage(reader, i);
        System.out.flush();

        newLine = str.split("\n");
        // A page whose first line is the contents marker is not rendered.
        if (newLine[0].equals("<<table of contents>>"))
            continue;

        if (newLine[0].equals("")) {
            // Page without a title: file name becomes "undefined" + running counter.
            data += "undefined" + noTitle;
            imgNum = 1;
            noTitle++;
            noTitleChk = true;
        } else {

            data = newLine[0].replace(" ", "");

            // Find the slide for this title. On the first page of a title, imgNum and tmpNum are
            // both set to the slide's page count; on following pages of the same title they are kept.
            // NOTE(review): when no slide matches, sData is left at the last list element.
            for (int j = 0; j < slideList.size(); j++) {
                sData = slideList.get(j);

                tmpStr = sData.getNodeName().replace(" ", "");
                if (data.equals(tmpStr)) {
                    if (oldStr.equals(tmpStr))
                        break;
                    oldStr = data;
                    tmpNum = imgNum = sData.getImgCnt();
                    break;
                }
            }
        }

        // render page i to an image
        PDFPage pdfPage = pdffile.getPage(i);

        // get the width and height for the doc at the default zoom
        Rectangle rect = new Rectangle(0, 0, (int) pdfPage.getBBox().getWidth(),
                (int) pdfPage.getBBox().getHeight());

        // generate the image

        Image image = pdfPage.getImage(rect.width, rect.height, // width
                // &
                // height
                rect, // clip rect
                null, // null for the ImageObserver
                true, // fill background with white
                true // block until drawing is done
        );

        // Copy the rendered image into an RGB BufferedImage so ImageIO can encode it.
        int w = image.getWidth(null);
        int h = image.getHeight(null);
        BufferedImage bi = new BufferedImage(w, h, BufferedImage.TYPE_INT_RGB);
        Graphics2D g2 = bi.createGraphics();
        g2.drawImage(image, 0, 0, null);
        g2.dispose();
        try {
            if (imgNum == 1) {
                // Single-page slide: "<title>.jpg", or "undefinedN.jpg" for an untitled page.
                if (noTitleChk)
                    ImageIO.write(bi, "jpg", new File(mkDirPath + data + ".jpg"));
                else
                    ImageIO.write(bi, "jpg", new File(mkDirPath + sData.getNodeName() + ".jpg"));

            } else {
                // Multi-page slide: suffix is imgNum - tmpNum, i.e. 0 for the first page of the
                // title and one more for each following page because tmpNum is decremented.
                ImageIO.write(bi, "jpg",
                        new File(mkDirPath + sData.getNodeName() + (imgNum - tmpNum) + ".jpg"));
                tmpNum--;
            }
        } catch (IOException ioe) {
            // A failed write is only reported; the remaining pages are still processed.
            System.out.println("write: " + ioe.getMessage());
        }
        data = "";
    }
    // The first slide records the total page count of the document.
    sData = slideList.get(0);
    sData.setsCnt(page);
    reader.close();
}

From source file:freemind.modes.ControllerAdapter.java

License:Open Source License

/**
 * Converts the table-of-contents page of a PDF into a FreeMind {@code .mm} mind-map file.
 *
 * <p>The output file is written next to the PDF, using the PDF path with its last four
 * characters replaced by {@code ".mm"}. The first page that contains the marker
 * {@code "<<table of contents>>"} is parsed line by line (first and last line skipped) into
 * {@link TableData} entries, which are then emitted as nested {@code <node>} elements.
 *
 * <p>Both the PDF reader and the output writer are now released on every path, including
 * when parsing or writing fails.
 *
 * @param filePath full path of the PDF; its last four characters are treated as the extension
 * @param fileName file name of the PDF; its last four characters are stripped for the root node text
 * @throws IOException if the output file cannot be created, written or closed
 */
public void pdf2mm(String filePath, String fileName) throws IOException {
    int depth = 0;
    String tmp[];
    String newLine[];
    String direction = "left";
    ArrayList<TableData> root = new ArrayList<TableData>();
    TableData oldTableData = new TableData();
    String mmFilePath = filePath.substring(0, filePath.length() - 4);
    fileName = fileName.substring(0, fileName.length() - 4);
    mmFilePath += ".mm";
    File mmFile = new File(mmFilePath);
    OutputStreamWriter out = new OutputStreamWriter(new FileOutputStream(mmFile), "UTF-8");
    try {
        ArrayList<SlideData> slideList;
        slideList = getController().getSlideList();
        // NOTE(review): imgPath is never used; the lookup is kept because it fails fast when
        // the slide list is empty, which callers may rely on.
        String imgPath = slideList.get(0).getImgPath();
        try {
            PdfReader reader = new PdfReader(filePath);
            try {
                int page = reader.getNumberOfPages();

                for (int i = 1; i <= page; i++) {
                    String str = PdfTextExtractor.getTextFromPage(reader, i);
                    System.out.flush();

                    if (str.indexOf("<<table of contents>>") >= 0) {
                        // NOTE(review): the <map> opening tag is only written when a contents
                        // page exists, while the closing tag below is always written.
                        out.write("<map version=\"0.9.0\">\n");

                        newLine = str.split("\n");
                        for (int j = 1; j < newLine.length - 1; j++) {
                            String data = "";
                            String hData = "";
                            TableData childTable = new TableData();

                            // Entries in the second half of the listing go to the right side.
                            if (j > (newLine.length / 2))
                                direction = "right";

                            tmp = newLine[j].split("\\.");

                            if (tmp[1].substring(0, 1).equals(" ")) {
                                // "N. Title": a top-level entry; only these carry a direction.
                                depth = 0;
                                data = tmp[1].substring(tmp[1].indexOf(" "), tmp[1].length());
                                childTable.setDirection(direction);
                                childTable.setHeadline(tmp[0]);
                                childTable.setDepth(depth);
                            } else {
                                // "N.M Title": depth is the number of dots in the line.
                                depth = tmp.length - 1;
                                childTable.setDepth(depth);

                                tmp = newLine[j].split(" ");
                                hData = tmp[0];
                                for (int k = 1; k < tmp.length; k++)
                                    data += tmp[k] + " ";
                                childTable.setHeadline(hData);
                                data = data.substring(0, data.length() - 1);
                            }
                            childTable.setData(data);

                            // The previous entry has children when this headline, minus its
                            // last two characters, equals the previous headline.
                            String tmpStr = childTable.getHeadline().toString();
                            if (tmpStr.length() > 2) {
                                if (tmpStr.substring(0, tmpStr.length() - 2).equals(oldTableData.getHeadline()))
                                    oldTableData.setHaveChild(true);
                            }

                            // Entries are added one step late so haveChild is final when stored.
                            if (j > 1)
                                root.add(oldTableData);

                            oldTableData = childTable;

                            if (j == newLine.length - 2) { // last entry: nothing follows, so it is a leaf
                                childTable.setHaveChild(false);
                                root.add(childTable);
                            }
                        }
                        // Only the first contents page is used.
                        break;
                    }
                }
            } finally {
                // The original never closed the reader.
                reader.close();
            }
        } catch (Exception e) {
            // Parsing problems are reported but do not abort; the file is still finished below.
            e.printStackTrace();
        }

        // NOTE(review): fileName and the node texts are written without XML escaping.
        out.write("<node CREATED=\"1365038113483\" ID=\"ID_1002961678\" " + "MODIFIED=\"1365038132371\" "
                + "TEXT=\"" + fileName + "\">\n");
        TableData showTable;
        int dif;
        for (int i = 0; i < root.size(); i++) {
            out.write("<node CREATED=\"1365038113483\" ID=\"ID_1002961678\" MODIFIED=\"1365038132371\" ");
            showTable = root.get(i);
            if (!showTable.getDirection().equals(""))
                out.write("POSITION=\"" + showTable.getDirection() + "\" ");
            out.write("TEXT=\"" + showTable.getData().trim() + "\"");
            if (showTable.isHaveChild())
                out.write(">\n");
            else
                out.write("/>\n");
            // Close as many open parents as the depth drops towards the next entry
            // (or down to depth 0 after the last entry).
            if (i == root.size() - 1)
                dif = showTable.getDepth();
            else
                dif = showTable.getDepth() - root.get(i + 1).getDepth();
            for (int j = 0; j < dif; j++)
                out.write("</node>\n");
        }
        out.write("</node>\n</map>\n");
    } finally {
        // Closing in finally releases the file handle even when a write above fails.
        out.close();
    }
}

From source file:itextblast.ITextBlast.java

/**
 * Splits a question-and-answer PDF into one document per question.
 *
 * <p>Every page is scanned for a "NO ... SOALAN" or "SOALAN ... NO" heading. Each heading
 * starts a new question; the pages collected since the previous heading are handed to
 * {@code ITextBlast.copySelectedQuestionPage} under the previous question number. Pages before
 * the first heading are written under the number {@code "0-intro"}, unless
 * {@code has_frontpage} is false, in which case the number detected on page 1 is used.
 *
 * <p>Fixes over the previous version: the last page is now scanned as well (the loop used
 * {@code i < n}), the final question now extends to the last page of the document instead of
 * stopping at its first page, and the reader is closed even when processing fails.
 *
 * @param qa_filename   value substituted into the {@code working_dir + SOURCE} format string
 * @param has_frontpage whether the document starts with a front page; must not be null
 * @throws IOException       if the PDF cannot be read
 * @throws DocumentException declared for callers; not thrown by any call visible in this method
 */
private static void processQAFile(String qa_filename, Boolean has_frontpage)
        throws IOException, DocumentException {

    PdfReader reader = new PdfReader(String.format(ITextBlast.working_dir + SOURCE, qa_filename));
    ITextBlast.my_reader = reader;
    try {
        int n = reader.getNumberOfPages();

        // The default extraction strategy is used on purpose; the location-based and simple
        // strategies were tried by the author and behaved oddly on these documents.

        // Best-effort detection of the first question number from the cover page.
        Pattern smart_start_pattern = Pattern.compile(".*?SOALAN.*?N.*?O.*?(\\d+)\\b+.*",
                Pattern.CASE_INSENSITIVE);
        String cover_page_content = PdfTextExtractor.getTextFromPage(reader, 1);
        Matcher smart_start_matcher = smart_start_pattern.matcher(cover_page_content);
        String smart_start_question_number = null;
        if (smart_start_matcher.find()) {
            smart_start_question_number = smart_start_matcher.group(1);
            out.println("Matched " + smart_start_matcher.group(0) + " and SMART Start Number: "
                    + smart_start_question_number);
        }

        // Liberal patterns only decide whether a page starts a question;
        // the strict patterns additionally capture the question number.
        Pattern liberal_found_question_pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*",
                Pattern.CASE_INSENSITIVE);
        Pattern liberal_found_question_pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*",
                Pattern.CASE_INSENSITIVE);
        Pattern pattern_uno = Pattern.compile(".*N.*O.*SOALAN.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE);
        Pattern pattern_dos = Pattern.compile(".*SOALAN.*N.*O.*?(\\d+)\\b+.*", Pattern.CASE_INSENSITIVE);

        // Page range and number of the question currently being collected.
        int start_page = 1;
        int end_page = 1;
        String question_number = "0-intro";

        // Documents without a front page start directly with the first question.
        if (!has_frontpage) {
            question_number = smart_start_question_number;
        }

        // Pages are 1-based and the last page must be scanned too, hence i <= n.
        for (int i = 1; i <= n; i++) {
            String found_question_number = null;
            boolean found_match = false;
            out.println("Page " + i);
            out.println("===========");
            String content = PdfTextExtractor.getTextFromPage(reader, i);
            Matcher liberal_uno_matcher = liberal_found_question_pattern_uno.matcher(content);
            if (liberal_uno_matcher.find()) {
                out.println("Matched UNO!");
                found_match = true;
                Matcher matcher = pattern_uno.matcher(content);
                // Keep the last number found; it may stay null when no digits are present.
                while (matcher.find()) {
                    found_question_number = matcher.group(1);
                    out.println("Matched " + matcher.group(0) + " and Question Number: " + found_question_number);
                }
            } else if (liberal_found_question_pattern_dos.matcher(content).find()) {
                if ("0-intro".equals(question_number)) {
                    // While still in the intro, a "SOALAN ... NO" page is not treated as a new question.
                    out.println("SMART!!!");
                } else {
                    found_match = true;
                    out.println("Matched DOS!");
                    Matcher matcher = pattern_dos.matcher(content);
                    // Keep the last number found; it may stay null when no digits are present.
                    while (matcher.find()) {
                        found_question_number = matcher.group(1);
                        out.println(
                                "Matched " + matcher.group(0) + " and Question Number: " + found_question_number);
                    }

                }
            }
            if (found_match) {
                // The previous question ends on the page before this heading (never before page 1).
                end_page = i - 1;
                if (end_page < 1) {
                    end_page = 1;
                }
                if (null == found_question_number) {
                    if ("0-intro".equals(question_number)) {
                        // First question without a readable number: fall back to the cover-page guess.
                        found_question_number = smart_start_question_number;
                        out.println("First question could not determine number; using Q No. => "
                                + found_question_number);
                        out.println("*****DEBUG Content*******");
                        out.println(content);
                    } else {
                        // Otherwise derive a unique name from the current question and a Unix timestamp.
                        found_question_number = question_number + "_" + (System.currentTimeMillis() / 1000L);
                        out.println(
                                "Unexpectedly could not determine number; using Q No. => " + found_question_number);
                        out.println("*****DEBUG Content*******");
                        out.println(content);
                    }
                }
                // Write out the pages collected for the previous question.
                ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number);
                // The new question starts on the current page.
                start_page = i;
                end_page = i;
                question_number = found_question_number;
            }
            out.println();
            out.println();
        }
        // The question still being collected runs to the end of the document.
        end_page = Math.max(end_page, n);
        if (start_page <= end_page) {
            ITextBlast.copySelectedQuestionPage(start_page, end_page, question_number);
        }
    } finally {
        reader.close();
    }
}

From source file:lecteur.Interface.java

/**
 * Reads page 22 of the given PDF, splits it into lines and collects the balance-sheet
 * entries ({@code Bilan}) found on each line.
 *
 * <p>A line is searched with a pattern of two upper-case letters followed by whitespace and
 * digits. All entries are printed via {@code show()} and finally passed to
 * {@code afficheResultat}. Any failure is printed to stderr and does not propagate.
 *
 * <p>Fix over the previous version: the {@code PdfReader} is now always closed.
 *
 * @param pdf_url location of the PDF, passed directly to {@code PdfReader}
 * @return always the empty string, because nothing is ever appended to the result buffer
 */
private static String ReadPDF(String pdf_url) {
    ArrayList<Bilan> tabRubriqueBilanTotal = new ArrayList<>();
    System.out.println("ReadPDF");

    StringBuilder str = new StringBuilder();

    // Two upper-case letters, whitespace, then a number optionally split by one whitespace.
    // TODO: handle rows that carry no figures.
    String pattern = "[A-Z]{2}\\p{Space}+\\d+\\p{Space}?\\d+";

    try {
        PdfReader reader = new PdfReader(pdf_url);
        try {
            int nbpage = reader.getNumberOfPages();
            System.out.println("Nombre de page = " + nbpage);
            // NOTE(review): only page 22 is processed; presumably the balance-sheet page of
            // one specific document. Confirm before using this with other PDFs.
            for (int i = 22; i <= 22; i++) {
                String str2 = PdfTextExtractor.getTextFromPage(reader, i);

                // One array element per line of the page.
                String[] row = splitPage(str2);
                System.out.println();
                System.out.println("\nnb row  traiter = " + row.length);

                for (int j = 0; j < row.length; j++) {
                    System.out.println("\nLigne  traiter AVANT FCT: " + row[j]);
                    // Entries found on this line are appended to the overall result.
                    ArrayList<Bilan> tabRubriqueBilan = recherchebilan(row[j], pattern);
                    tabRubriqueBilanTotal.addAll(tabRubriqueBilan);
                }

                for (Bilan bilan : tabRubriqueBilanTotal) {
                    bilan.show();
                }

                System.out.println("TAILLE TAB PAR PAGE = " + tabRubriqueBilanTotal.size());
            }
            afficheResultat(tabRubriqueBilanTotal);
        } finally {
            // The original never closed the reader.
            reader.close();
        }
    } catch (Exception err) {
        err.printStackTrace();
    }
    return str.toString();
}

From source file:no.dusken.aranea.admin.control.issue.IssueIndexer.java

License:Apache License

/**
 * Creates the {@link IssuePage} for one page of an issue's PDF.
 *
 * @param pageNumber page whose text is extracted; passed unchanged to the extractor
 * @param pdfReader  reader opened on the issue's PDF
 * @param issue      issue the page belongs to
 * @return a new page object holding the page number, the extracted text and the issue
 * @throws IOException if the text of the page cannot be extracted
 */
private IssuePage getIssuePage(int pageNumber, PdfReader pdfReader, Issue issue) throws IOException {
    return new IssuePage(pageNumber, PdfTextExtractor.getTextFromPage(pdfReader, pageNumber), issue);
}

From source file:org.karsha.document.PDFBookmark.java

License:Open Source License

/**
 * Extracts the text of one bookmarked section and stores it in {@code docsSeperated} under
 * the section title.
 *
 * <p>If the first page contains {@code title}, the text before the title is handed to
 * {@code addToPreSec} (it belongs to the previous bookmark) and the section starts at the
 * title. Otherwise the whole first page is part of the section. Pages up to, but not
 * including, {@code pageTo} are appended.
 *
 * <p>Fixes over the previous version: when the title was not found, the stored text started
 * with the literal string "null" (or nothing was stored because of a swallowed
 * NullPointerException); the first page is no longer extracted twice.
 *
 * @param reader          reader opened on the PDF
 * @param title           title of the section; also the key under which the text is stored
 * @param preBokmarkTitle title of the preceding bookmark, which receives the text before {@code title}
 * @param pageFrom        first page of the section (1-based)
 * @param pageTo          page at which the section ends (exclusive)
 * @throws IOException if the first page cannot be read
 */
public void parsePdf(PdfReader reader, String title, String preBokmarkTitle, int pageFrom, int pageTo)
        throws IOException {

    StringBuilder pagetext = new StringBuilder();

    String text = PdfTextExtractor.getTextFromPage(reader, pageFrom);
    int titlePos = text.indexOf(title);
    if (titlePos >= 0) {
        String txtPreSec = text.substring(0, titlePos);
        if (!txtPreSec.isEmpty()) {
            addToPreSec(preBokmarkTitle, txtPreSec);
        }
        pagetext.append(text, titlePos, text.length());
        // The first page has been consumed; continue with the next one.
        pageFrom++;
    }

    try {
        while (pageFrom < pageTo) {
            pagetext.append(PdfTextExtractor.getTextFromPage(reader, pageFrom));
            pageFrom++;
        }

        if (pagetext.length() > 0) {
            docsSeperated.put(title, pagetext.toString());
        }
    } catch (Exception ex) {
        // Best effort, as before: a section whose pages cannot be read is skipped.
        // NOTE(review): consider logging ex here; the original log call was commented out.
    }

}

From source file:Package1.MAIN.java

/**
 * Extracts the text of the PDF at {@code dir}, shows it in {@code jTextPane1} and reports
 * progress through {@code progressive_status}.
 *
 * <p>Documents with up to 250 pages are converted completely. Larger documents are cut off
 * after page 200 and a notice is shown. Each page is preceded by a "Page: n" separator.
 *
 * <p>Fixes over the previous version: the {@code PdfReader} is now closed, the text is
 * accumulated in a {@code StringBuilder} instead of repeated string concatenation, and the two
 * duplicated page loops are merged.
 */
public void pdf_reader() {
    try {
        PdfReader reader = new PdfReader(dir);
        try {
            int p = reader.getNumberOfPages();
            boolean truncated = p > 250;
            int lastPage = truncated ? 200 : p;

            StringBuilder page = new StringBuilder();
            for (int i = 1; i <= lastPage; i++) {
                // The first separator has no leading blank lines.
                page.append(i == 1 ? "\t\n" : "\t\n\n\n");
                page.append("---------------\t\tPage: ").append(i).append("\t\t---------------\n\n\n");
                page.append(PdfTextExtractor.getTextFromPage(reader, i));

                // Progress is relative to the total page count, also when truncated.
                progressive_status((i * 100) / p);
            }

            if (truncated) {
                JOptionPane.showMessageDialog(null,
                        "Only 200 Page converted. \n Download pro version for full access. ", "Warning!!",
                        JOptionPane.PLAIN_MESSAGE);
            }

            jTextPane1.setText(page.toString());
            jLabel1.setText("Total Pages: " + p);
        } finally {
            // The original never closed the reader.
            reader.close();
        }
    } catch (IOException ex) {
        Logger.getLogger(MAIN.class.getName()).log(Level.SEVERE, null, ex);
    }
}