Example usage for org.apache.commons.io.input BOMInputStream BOMInputStream

List of usage examples for org.apache.commons.io.input BOMInputStream BOMInputStream

Introduction

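This page collects usage examples for the constructors of org.apache.commons.io.input.BOMInputStream. Most of the examples below call the two-argument overload BOMInputStream(InputStream, boolean); only one passes explicit ByteOrderMark values as in the prototype shown.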

Prototype

public BOMInputStream(InputStream delegate, ByteOrderMark... boms) 

Source Link

Document

Constructs a new BOMInputStream that detects the specified BOMs and excludes them from the stream content.

Usage

From source file:adept.io.Reader.java

/**
 * Reads specified XML file into a DOM object.
 *
 * @param path the path/*from  w w w .  j  a v a 2s .c  o  m*/
 * @return the document
 */
public org.w3c.dom.Document readXML(String path) {
    try {
        InputStream is = findStreamInClasspathOrFileSystem(path);
        BOMInputStream bis = new BOMInputStream(is, false);
        DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
        DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
        org.w3c.dom.Document doc = dBuilder.parse(bis);
        // TODO - might have backslashes.
        doc.setDocumentURI(path);
        return doc;
    } catch (Exception e) {
        //e.printStackTrace();
    }
    return null;
}

From source file:com.thetdgroup.TextExtractionAdapter.java

/**
 * Parses a file with an {@code AutoDetectParser} and collects its body text
 * and metadata into a {@code ContentInformation}.
 *
 * <p>The file is read through a {@code BOMInputStream} so that a leading byte
 * order mark is not passed to the parser. Any exception raised while opening,
 * parsing or closing the file is recorded on the returned object via
 * {@code setException(...)} instead of being thrown; the body text and
 * metadata gathered up to that point are still copied.
 *
 * @param fileName the file to process
 * @return the extracted content, never {@code null}
 * @throws IOException declared for compatibility with existing callers
 */
private ContentInformation processFile(File fileName) throws IOException {
    ContentInformation extractedContent = new ContentInformation();
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    // try-with-resources closes both streams exactly once, in reverse order.
    try (InputStream inputStream = new FileInputStream(fileName);
            BOMInputStream bomInputStream = new BOMInputStream(inputStream, false)) {
        metadata.set(Metadata.RESOURCE_NAME_KEY, fileName.getName());

        Parser parser = new AutoDetectParser();
        parser.parse(bomInputStream, contenthandler, metadata);
    } catch (Exception exception) {
        // NOTE(review): the return value of hasException() is discarded; if it is a
        // plain getter this call has no effect - confirm against ContentInformation.
        extractedContent.hasException();
        extractedContent.setException(exception.toString());
    }

    extractedContent.setImportedFileName(fileName.getName());

    // Flatten the body text to a single line.
    String content = contenthandler.toString().replace("\n", " ");
    extractedContent.setContentData(content);

    populateFromMetadata(metadata, extractedContent);

    return extractedContent;
}

/**
 * Copies every supported metadata property from {@code metadata} onto
 * {@code extractedContent}. Properties that are absent are copied as whatever
 * {@code metadata.get(...)} returns for them.
 */
private void populateFromMetadata(Metadata metadata, ContentInformation extractedContent) {
    // CREATIVE COMMONS
    extractedContent.setLicenseLocation(metadata.get(Metadata.LICENSE_LOCATION));
    extractedContent.setLicenceURL(metadata.get(Metadata.LICENSE_URL));
    extractedContent.setWorkType(metadata.get(Metadata.WORK_TYPE));

    // DUBLIN CORE
    extractedContent.setContributor(metadata.get(Metadata.CONTRIBUTOR));
    extractedContent.setCoverage(metadata.get(Metadata.COVERAGE));
    extractedContent.setCreator(metadata.get(Metadata.CREATOR));
    extractedContent.setDate(metadata.get(Metadata.DATE));
    extractedContent.setDescription(metadata.get(Metadata.DESCRIPTION));
    extractedContent.setFormat(metadata.get(Metadata.FORMAT));
    extractedContent.setIdentifier(metadata.get(Metadata.IDENTIFIER));
    extractedContent.setLanguage(metadata.get(Metadata.LANGUAGE));
    extractedContent.setModified(metadata.get(Metadata.MODIFIED));
    extractedContent.setPublisher(metadata.get(Metadata.PUBLISHER));
    extractedContent.setRelation(metadata.get(Metadata.RELATION));
    extractedContent.setRights(metadata.get(Metadata.RIGHTS));
    extractedContent.setDublinSource(metadata.get(org.apache.tika.metadata.DublinCore.SOURCE));
    extractedContent.setSubject(metadata.get(Metadata.SUBJECT));
    extractedContent.setTitle(metadata.get(Metadata.TITLE));
    extractedContent.setType(metadata.get(Metadata.TYPE));

    // GEOGRAPHIC
    //extractedContent.setAltitude(metadata.get(Metadata.ALTITUDE));
    //extractedContent.setLatitude(metadata.get(Metadata.LATITUDE));
    //extractedContent.setLongitude(metadata.get(Metadata.LONGITUDE));

    // HTTP HEADERS
    extractedContent.setContentDisposition(metadata.get(Metadata.CONTENT_DISPOSITION));
    extractedContent.setContentEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    extractedContent.setContentLanguage(metadata.get(Metadata.CONTENT_LANGUAGE));
    extractedContent.setContentLength(metadata.get(Metadata.CONTENT_LENGTH));
    extractedContent.setContentLocation(metadata.get(Metadata.CONTENT_LOCATION));
    extractedContent.setContentMD5(metadata.get(Metadata.CONTENT_MD5));
    extractedContent.setContentType(metadata.get(Metadata.CONTENT_TYPE));
    extractedContent.setLastModifier(metadata.get(Metadata.LAST_MODIFIED));
    extractedContent.setLocation(metadata.get(Metadata.LOCATION));

    // MESSAGE (EMAIL)
    //extractedContent.setMessageBCC(metadata.get(Metadata.MESSAGE_BCC));
    //extractedContent.setMessageCC(metadata.get(Metadata.MESSAGE_CC));
    //extractedContent.setMessageFrom(metadata.get(Metadata.MESSAGE_FROM));
    //extractedContent.setMessageRecipientAddress(metadata.get(Metadata.MESSAGE_RECIPIENT_ADDRESS));
    //extractedContent.setMessageTo(metadata.get(Metadata.MESSAGE_TO));

    // MS OFFICE
    extractedContent.setApplicationName(metadata.get(Metadata.APPLICATION_NAME));
    extractedContent.setApplicationVersion(metadata.get(Metadata.APPLICATION_VERSION));
    extractedContent.setAuthor(metadata.get(Metadata.AUTHOR));
    extractedContent.setCategory(metadata.get(Metadata.CATEGORY));
    extractedContent.setCharacterCount(metadata.get(Metadata.CHARACTER_COUNT));
    extractedContent.setCharacterCountWithSpace(metadata.get(Metadata.CHARACTER_COUNT_WITH_SPACES));
    extractedContent.setComments(metadata.get(Metadata.COMMENTS));
    extractedContent.setCompany(metadata.get(Metadata.COMPANY));
    extractedContent.setContentStatus(metadata.get(Metadata.CONTENT_STATUS));
    extractedContent.setCreationDate(metadata.get(Metadata.CREATION_DATE));
    extractedContent.setEditTime(metadata.get(Metadata.EDIT_TIME));
    extractedContent.setKeywords(metadata.get(Metadata.KEYWORDS));
    extractedContent.setLastAuthor(metadata.get(Metadata.LAST_AUTHOR));
    extractedContent.setLastPrinted(metadata.get(Metadata.LAST_PRINTED));
    extractedContent.setLastSaved(metadata.get(Metadata.LAST_SAVED));
    extractedContent.setLineCount(metadata.get(Metadata.LINE_COUNT));
    extractedContent.setManager(metadata.get(Metadata.MANAGER));
    extractedContent.setNotes(metadata.get(Metadata.NOTES));
    extractedContent.setPageCount(metadata.get(Metadata.PAGE_COUNT));
    extractedContent.setParagraphCount(metadata.get(Metadata.PARAGRAPH_COUNT));
    extractedContent.setPresentationFormat(metadata.get(Metadata.PRESENTATION_FORMAT));
    extractedContent.setRevisionNumber(metadata.get(Metadata.REVISION_NUMBER));
    extractedContent.setSecurity(metadata.get(Metadata.SECURITY));
    extractedContent.setSlideCount(metadata.get(Metadata.SLIDE_COUNT));
    extractedContent.setTemplate(metadata.get(Metadata.TEMPLATE));
    extractedContent.setTotalTime(metadata.get(Metadata.TOTAL_TIME));
    extractedContent.setVersion(metadata.get(Metadata.VERSION));
    extractedContent.setWordCount(metadata.get(Metadata.WORD_COUNT));

    // CLIMATEFORCAST
    //extractedContent.setClimateForcastAcknowledgement(metadata.get(org.apache.tika.metadata.ClimateForcast.ACKNOWLEDGEMENT));
    //extractedContent.setClimateForcastCommandLine(metadata.get(org.apache.tika.metadata.ClimateForcast.COMMAND_LINE));
    //extractedContent.setClimateForcastComment(metadata.get(org.apache.tika.metadata.ClimateForcast.COMMENT));
    //extractedContent.setClimateForcastContact(metadata.get(org.apache.tika.metadata.ClimateForcast.CONTACT));
    //extractedContent.setClimateForcastConvention(metadata.get(org.apache.tika.metadata.ClimateForcast.CONVENTIONS));
    //extractedContent.setClimateForcastExperimentID(metadata.get(org.apache.tika.metadata.ClimateForcast.EXPERIMENT_ID));
    //extractedContent.setClimateForcastHistory(metadata.get(org.apache.tika.metadata.ClimateForcast.HISTORY));
    //extractedContent.setClimateForcastInstitution(metadata.get(org.apache.tika.metadata.ClimateForcast.INSTITUTION));
    //extractedContent.setClimateForcastModelName(metadata.get(org.apache.tika.metadata.ClimateForcast.MODEL_NAME_ENGLISH));
    //extractedContent.setClimateForcastProgramID(metadata.get(org.apache.tika.metadata.ClimateForcast.PROGRAM_ID));
    //extractedContent.setClimateForcastProjectID(metadata.get(org.apache.tika.metadata.ClimateForcast.PROJECT_ID));
    //extractedContent.setClimateForcastRealization(metadata.get(org.apache.tika.metadata.ClimateForcast.REALIZATION));
    //extractedContent.setClimateForcastReferences(metadata.get(org.apache.tika.metadata.ClimateForcast.REFERENCES));
    //extractedContent.setClimateForcastSource(metadata.get(org.apache.tika.metadata.ClimateForcast.SOURCE));
    //extractedContent.setClimateForcastTableID(metadata.get(org.apache.tika.metadata.ClimateForcast.TABLE_ID));

    // TIFF
    //extractedContent.setTIFFBitsPerSample(metadata.get(Metadata.BITS_PER_SAMPLE));
    //extractedContent.setTIFFEquipmentMake(metadata.get(Metadata.EQUIPMENT_MAKE));
    //extractedContent.setTIFFEquipmentModel(metadata.get(Metadata.EQUIPMENT_MODEL));
    //extractedContent.setTIFFExposureLimit(metadata.get(Metadata.EXPOSURE_TIME));
    //extractedContent.setTIFFFNumber(metadata.get(Metadata.F_NUMBER));
    //extractedContent.setTIFFFlashFired(metadata.get(Metadata.FLASH_FIRED));
    //extractedContent.setTIFFFocalLength(metadata.get(Metadata.FOCAL_LENGTH));
    //extractedContent.setTIFFImageLength(metadata.get(Metadata.IMAGE_LENGTH));
    //extractedContent.setTIFFImageWidth(metadata.get(Metadata.IMAGE_WIDTH));
    //extractedContent.setTIFFISOSpeedRating(metadata.get(Metadata.ISO_SPEED_RATINGS));
    //extractedContent.setTIFFOrientation(metadata.get(Metadata.ORIENTATION));
    //extractedContent.setTIFFOriginalDate(metadata.get(Metadata.ORIGINAL_DATE));
    //extractedContent.setTIFFResolutionHorizontal(metadata.get(Metadata.RESOLUTION_HORIZONTAL));
    //extractedContent.setTIFFResolutionUnit(metadata.get(Metadata.RESOLUTION_UNIT));
    //extractedContent.setTIFFResolutionVertical(metadata.get(Metadata.RESOLUTION_VERTICAL));
    //extractedContent.setTIFFSamplePerPixel(metadata.get(Metadata.SAMPLES_PER_PIXEL));
    //extractedContent.setTIFFSoftware(metadata.get(Metadata.SOFTWARE));

    // TIKA METADATA KEYS
    extractedContent.setResourceNameKey(metadata.get(Metadata.RESOURCE_NAME_KEY));

    // TIKA MIME KEYS
    extractedContent.setMimeTypeMagic(metadata.get(Metadata.MIME_TYPE_MAGIC));
    extractedContent.setTikaMimeType(metadata.get(Metadata.TIKA_MIME_FILE));
}

From source file:ca.uhn.fhir.validation.SchemaBaseValidator.java

/**
 * Loads the named schema as a classpath resource and wraps it in a
 * {@link Source} that carries the given system id.
 *
 * <p>The resource path is the schema-definition path reported by
 * {@code myCtx.getVersion()} joined with {@code theSchemaName}. The stream is
 * passed through a {@code BOMInputStream} and decoded as UTF-8.
 *
 * @param theSystemId   the system id to attach to the returned source
 * @param theSchemaName the schema file name, relative to the schema-definition path
 * @return a source reading the schema
 * @throws InternalErrorException if the resource cannot be found
 */
private Source loadXml(String theSystemId, String theSchemaName) {
    final String resourcePath = myCtx.getVersion().getPathToSchemaDefinitions() + '/' + theSchemaName;
    ourLog.debug("Going to load resource: {}", resourcePath);

    final InputStream rawStream = FhirValidator.class.getResourceAsStream(resourcePath);
    if (rawStream == null) {
        throw new InternalErrorException("Schema not found. " + RESOURCES_JAR_NOTE);
    }

    // Drop a leading byte order mark before the bytes reach the decoder.
    final InputStream bomFreeStream = new BOMInputStream(rawStream, false);
    final InputStreamReader schemaReader = new InputStreamReader(bomFreeStream, Charset.forName("UTF-8"));
    return new StreamSource(schemaReader, theSystemId);
}

From source file:com.bigdata.rdf.rio.ntriples.BigdataNTriplesParser.java

/**
 * Implementation of the <tt>parse(InputStream, String)</tt> method defined
 * in the RDFParser interface. The stream is passed through a
 * <tt>BOMInputStream</tt>, decoded as US-ASCII and handed to
 * <tt>parse(Reader, String)</tt>.
 *
 * @param in
 *        The InputStream from which to read the data, must not be
 *        <tt>null</tt>. The InputStream is supposed to contain 7-bit
 *        US-ASCII characters, as per the N-Triples specification.
 * @param baseURI
 *        The URI associated with the data in the InputStream, must not be
 *        <tt>null</tt>.
 * @throws IOException
 *         If an I/O error occurred while data was read from the InputStream.
 * @throws RDFParseException
 *         If the parser has found an unrecoverable parse error.
 * @throws RDFHandlerException
 *         If the configured statement handler encountered an unrecoverable
 *         error.
 * @throws IllegalArgumentException
 *         If the supplied input stream or base URI is <tt>null</tt>.
 */
@Override
public synchronized void parse(InputStream in, String baseURI)
        throws IOException, RDFParseException, RDFHandlerException {
    if (in == null) {
        throw new IllegalArgumentException("Input stream can not be 'null'");
    }
    // The baseURI argument is validated by parse(Reader, String).

    try {
        final BOMInputStream bomFreeInput = new BOMInputStream(in, false);
        final InputStreamReader asciiReader = new InputStreamReader(bomFreeInput, "US-ASCII");
        parse(asciiReader, baseURI);
    } catch (UnsupportedEncodingException unsupported) {
        // US-ASCII is expected to be available on every platform.
        throw new RuntimeException(unsupported);
    }
}

From source file:com.enonic.cms.core.resource.FileResourceServiceImpl.java

/**
 * Opens the file that backs the given resource name.
 *
 * @param name      the resource to open
 * @param ignoreBom if {@code true}, the returned stream is wrapped in a
 *                  {@code BOMInputStream} so a leading byte order mark is skipped
 * @return a stream over the file, or {@code null} if {@code getFile(name)}
 *         does not denote a regular file
 * @throws RuntimeException if the file cannot be opened; the underlying
 *                          {@link IOException} is attached as the cause
 */
@Override
public InputStream getResourceStream(final FileResourceName name, final boolean ignoreBom) {
    final File file = getFile(name);

    if (!file.isFile()) {
        return null;
    }

    try {
        final FileInputStream in = new FileInputStream(file);
        if (ignoreBom) {
            return new BOMInputStream(in, false);
        } else {
            return in;
        }
    } catch (final IOException e) {
        // Keep the original exception as the cause so the real reason is not lost.
        throw new RuntimeException("Failed to open " + name.getPath(), e);
    }
}

From source file:at.jku.ce.adaptivetesting.vaadin.ui.topic.accounting.AccountingQuestionManager.java

/**
 * Loads all question files from a folder and registers them via
 * {@code addQuestion}.
 *
 * <p>Candidates are regular files whose name ends in {@code .xml} and that are
 * readable (or can be made readable). Each candidate is read as UTF-8 with a
 * leading UTF-8 byte order mark skipped, its lines are concatenated without
 * line separators, and every {@code "& "} is replaced by {@code "&amp; "}
 * before unmarshalling. The question type is chosen by checking whether the
 * text contains the root element name of the profit or the accounting
 * question class; files matching neither are logged and skipped.
 *
 * @param containingFolder the folder holding the question XML files
 * @return the number of candidate XML files found.
 *         NOTE(review): this is the candidate count, not the number of questions
 *         actually loaded (which is only logged) - confirm which one callers expect.
 * @throws JAXBException if a JAXB context cannot be created or a file cannot be unmarshalled
 * @throws IOException   if the folder cannot be listed or a file cannot be read
 */
public int loadQuestions(File containingFolder) throws JAXBException, IOException {
    assert containingFolder.exists() && containingFolder.isDirectory();
    JAXBContext accountingJAXB = JAXBContext.newInstance(XmlAccountingQuestion.class,
            AccountingDataStorage.class);
    JAXBContext profitJAXB = JAXBContext.newInstance(XmlProfitQuestion.class, ProfitDataStorage.class);

    Unmarshaller accountingUnmarshaller = accountingJAXB.createUnmarshaller();
    Unmarshaller profitUnmarshaller = profitJAXB.createUnmarshaller();

    final List<AccountingQuestion> accountingList = new ArrayList<>();
    final List<ProfitQuestion> profitList = new ArrayList<>();

    String accountingRootElement = XmlAccountingQuestion.class.getAnnotation(XmlRootElement.class).name();
    String profitRootElement = XmlProfitQuestion.class.getAnnotation(XmlRootElement.class).name();

    File[] questions = containingFolder
            .listFiles(f -> f.isFile() && (f.canRead() || f.setReadable(true)) && f.getName().endsWith(".xml"));
    if (questions == null) {
        // listFiles returns null when the path is not a directory or cannot be read;
        // report that instead of failing with a NullPointerException below.
        throw new IOException("Unable to list question files in " + containingFolder);
    }

    // read all questions
    LogHelper.logInfo("Found " + questions.length + " potential questions");
    int successfullyLoaded = 0;
    for (File f : questions) {
        LogHelper.logInfo("Loading question with filename: " + f.getName());
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader (and the file) even if reading fails
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new BOMInputStream(new FileInputStream(f), ByteOrderMark.UTF_8), "UTF8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
        }
        // Escape bare ampersands followed by a space so the text can be unmarshalled.
        String fileAsString = sb.toString().replaceAll("& ", "&amp; ");
        if (fileAsString.contains(profitRootElement)) {
            LogHelper.logInfo("Question detected as " + ProfitQuestion.class.getName());
            // Profit Question
            XmlProfitQuestion question = (XmlProfitQuestion) profitUnmarshaller
                    .unmarshal(new StringReader(fileAsString));
            profitList.add(AccountingXmlHelper.fromXml(question, f.getName()));
            successfullyLoaded++;
        } else if (fileAsString.contains(accountingRootElement)) {
            LogHelper.logInfo("Question detected as " + AccountingQuestion.class.getName());
            // Accounting Question
            XmlAccountingQuestion question = (XmlAccountingQuestion) accountingUnmarshaller
                    .unmarshal(new StringReader(fileAsString));
            accountingList.add(AccountingXmlHelper.fromXml(question, f.getName()));
            successfullyLoaded++;
        } else {
            LogHelper.logInfo(
                    "QuestionManager: item type not supported for " + f.getName() + ", ignoring file.");
            // Unsupported files are skipped rather than treated as an error.
            continue;
        }
        LogHelper.logInfo("Loaded question with filename:" + f.getName());
    }
    // Add question to the question manager
    accountingList.forEach(q -> addQuestion(q));
    profitList.forEach(q -> addQuestion(q));
    LogHelper.logInfo("Successfully loaded " + successfullyLoaded + " questions.");
    return questions.length;
}

From source file:com.ggvaidya.scinames.model.Dataset.java

/**
 * Load this dataset from a CSV file. We load the entire CSV file, except
 * for blank cells./*  w w w .j  a  v  a2s  .  com*/
 * 
 * @param project The project to which the resulting Dataset should belong
 * @param csvFormat The CSV format of the input file.
 * @param csvFile The input file to load.
 * @param renamedColumns Rename these columns on the fly.
 * @return
 * @throws IOException 
 */
public static Dataset fromCSV(CSVFormat csvFormat, File csvFile) throws IOException {
    Dataset dataset = new Dataset(csvFile.getName(), new SimplifiedDate(), Dataset.TYPE_CHECKLIST);

    // Get ready to filter input files.
    InputStream ins = new FileInputStream(csvFile);

    // Look for BOMs and discard!
    ins = new BOMInputStream(ins, false);

    // Convert into a Reader.
    Reader reader = new BufferedReader(new InputStreamReader(ins));

    // Load CSV
    CSVParser parser = csvFormat.withHeader().parse(reader);
    Map<String, Integer> headerMap = parser.getHeaderMap();

    dataset.setColumns(headerMap.entrySet().stream().sorted((Object o1, Object o2) -> {
        Map.Entry<String, Integer> e1 = (Map.Entry) o1;
        Map.Entry<String, Integer> e2 = (Map.Entry) o2;

        return e1.getValue().compareTo(e2.getValue());
    }).map(e -> e.getKey()).map(colName -> DatasetColumn.of(colName))
            /*
            .map(col -> {
               // Rename any renamedColumns.
               if(renamedColumns.containsKey(col))
            return renamedColumns.get(col);
               else
            return col;
            })*/
            .collect(Collectors.toList()));

    dataset.rows.clear();
    dataset.rows.addAll(parser.getRecords().stream().map(record -> {
        DatasetRow row = new DatasetRow(dataset);
        row.putAll(record.toMap());
        return row;
    }).collect(Collectors.toList()));

    return dataset;
}

From source file:adept.io.Reader.java

/**
 * Find stream in classpath or file system.
 *
 * @param name the name/*from  ww w. j a  v  a2 s .  co m*/
 * @return the input stream
 * @throws FileNotFoundException the file not found exception
 */
public static BOMInputStream findStreamInClasspathOrFileSystem(String name) throws FileNotFoundException {
    InputStream is = Reader.class.getClassLoader().getResourceAsStream(name);
    BOMInputStream bis = new BOMInputStream(is, false);
    if (is == null) {
        is = Reader.class.getClassLoader().getResourceAsStream(name.replaceAll("\\\\", "/"));
        bis = new BOMInputStream(is, false);
        if (is == null) {
            File f = new File(name);
            if (!f.exists()) {
                System.out.println("Warning - creating FileInputStream: " + f.getAbsolutePath());
            }
            is = new FileInputStream(name);
            bis = new BOMInputStream(is, false);
            //System.out.println("Reading InputStream: " + f.getAbsolutePath());
        } else {
            System.out.println("InputStream found in Class Loader after adjusting separators.");
        }
    } else {
        //System.out.println("InputStream found in Class Loader.");
    }
    return bis;
}

From source file:adept.io.Reader.java

/**
 * Reads a UTF-8 file, skipping a leading byte order mark, and collects its
 * non-empty lines.
 *
 * <p>Each non-empty line is passed through {@code checkSurrogates}, appended
 * to {@code lines} and appended to the returned text followed by
 * {@code "\n"}. Empty lines are dropped.
 *
 * @param filename the filename
 * @param lines    the list that receives the lines; if {@code null}, a
 *                 temporary list is used and only the returned text is available
 * @return the processed non-empty lines, each terminated by {@code "\n"}
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
 */
public String readFileIntoLines(String filename, List<String> lines) {
    if (lines == null) {
        lines = new ArrayList<String>();
    }
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader even when reading fails part-way.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
            new BOMInputStream(new FileInputStream(new File(filename)), false), "UTF-8"))) {
        String line;
        while ((line = in.readLine()) != null) {
            if (!line.isEmpty()) {
                String surrogatesRemoved = checkSurrogates(line);
                lines.add(surrogatesRemoved);
                sb.append(surrogatesRemoved);
                sb.append("\n");
            }
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return sb.toString();
}

From source file:adept.io.Reader.java

/**
 * Reads a UTF-8 file, skipping a leading byte order mark, into a list of
 * lines. Every line, including empty ones, is passed through
 * {@code checkSurrogates} before being added.
 *
 * @param filename the filename
 * @return the list of processed lines, in file order
 * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read
 */
public List<String> fileToLines(String filename) {
    List<String> lines = new LinkedList<String>();
    // try-with-resources closes the reader even when reading fails part-way.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(
            new BOMInputStream(new FileInputStream(new File(filename)), false), "UTF-8"))) {
        String line;
        while ((line = in.readLine()) != null) {
            lines.add(checkSurrogates(line));
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    return lines;
}