marytts.tools.analysis.CopySynthesis.java Source code

Java tutorial

Introduction

Here is the source code for marytts.tools.analysis.CopySynthesis.java

Source

/**
 * Copyright 2009 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package marytts.tools.analysis;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.Arrays;
import java.util.Formatter;
import java.util.Locale;
import java.util.regex.Pattern;

import javax.sound.sampled.AudioInputStream;
import javax.sound.sampled.AudioSystem;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;

import marytts.client.http.MaryHttpClient;
import marytts.datatypes.MaryXML;
import marytts.modules.phonemiser.AllophoneSet;
import marytts.signalproc.analysis.F0TrackerAutocorrelationHeuristic;
import marytts.signalproc.analysis.Label;
import marytts.signalproc.analysis.Labels;
import marytts.signalproc.analysis.PitchFileHeader;
import marytts.signalproc.analysis.PitchReaderWriter;
import marytts.util.data.audio.AudioDoubleDataSource;
import marytts.util.data.text.LabelfileDoubleDataSource;
import marytts.util.data.text.PraatPitchTier;
import marytts.util.dom.DomUtils;
import marytts.util.dom.MaryDomUtils;
import marytts.util.string.StringUtils;

import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.traversal.NodeIterator;

/**
 * Impose duration and/or intonation from one version of an utterance to another version.
 * The segmental chain of both utterances can have tiny deviations but must be sufficiently similar 
 * to allow the MaryTranscriptionAligner to match the two.
 * The destination must be in ALLOPHONES or ACOUSTPARAMS format, i.e. have syllabic and segmental substructure in the tokens.
 * The source can be in a number of formats.
 *  
 * @author marc
 *
 */
public class CopySynthesis {
    AllophoneSet allophoneSet;

    /**
     * Provide copy synthesis functionality for documents using the given allophone set.
     * @param allophoneSet the allophone set to use, or null if no allophone verification is needed.
     */
    public CopySynthesis(AllophoneSet allophoneSet) {
        this.allophoneSet = allophoneSet;
    }

    /**
     * Make sure that the label sequence as provided in source is copied into the target document.
     * @param source a label sequence consisting of valid allophones according to the allophone set given in the constructor.
     * @param target a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in phones.
     */
    public void imposeSegments(Labels source, Document target) {
        MaryTranscriptionAligner aligner = new MaryTranscriptionAligner(allophoneSet, true);
        aligner.SetEnsureInitialBoundary(false);
        String labels = StringUtils.join(aligner.getEntrySeparator(), source.getLabelSymbols());
        aligner.alignXmlTranscriptions(target, labels);

    }

    /**
     * Make sure that the label sequence as provided in source is copied into the target document, and
     * that the phone and boundary durations in the target are adjusted to those in source.
     * @param source a label sequence consisting of valid allophones according to the allophone set given in the constructor.
     * @param target a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in phones.
     */
    public void imposeDurations(Labels source, Document target) {
        imposeSegments(source, target);
        // we trust that the following holds: (see CopySynthesisTest):
        // assert Arrays.deepEquals(source.getLabelSymbols(), new Labels(target).getLabelSymbols());
        // or in other words, the number of labels in source are now the same as the number of
        // phone and boundary items in target.
        String PHONE = "ph";
        String BOUNDARY = "boundary";
        NodeIterator it = DomUtils.createNodeIterator(target, PHONE, BOUNDARY);

        Element e = null;
        double prevEndTime = 0;
        int iSource = 0;
        while ((e = (Element) it.nextNode()) != null) {
            updateDurationAndEndTime(e, source.items[iSource], prevEndTime);
            prevEndTime = source.items[iSource].time;
            iSource++;
        }
    }

    private void updateDurationAndEndTime(Element e, Label label, double prevEndTime) {
        String PHONE = "ph";
        String A_PHONE_DURATION = "d";
        String A_PHONE_SYMBOL = "p";
        String A_PHONE_END = "end";
        String BOUNDARY = "boundary";
        String A_BOUNDARY_DURATION = "duration";

        assert label.time > prevEndTime;
        double durationSeconds = label.time - prevEndTime;
        String durationMillisString = String.valueOf((int) Math.round(1000 * durationSeconds));
        // System.out.println("For label "+label+", setting duration "+durationSeconds+" -> "+durationMillisString);
        if (e.getTagName().equals(PHONE)) {
            assert label.phn.equals(e.getAttribute(A_PHONE_SYMBOL));
            e.setAttribute(A_PHONE_DURATION, durationMillisString);
            e.setAttribute(A_PHONE_END, String.valueOf(label.time));
        } else {
            assert e.getTagName().equals(BOUNDARY);
            e.setAttribute(A_BOUNDARY_DURATION, durationMillisString);
        }

    }

    /**
     * Make sure that 1. the label sequence as provided in source is copied into the target document,
     * 2. the phone and boundary durations in the target are adjusted to those in source,
     * and 3. the intonation targets in the target are replaced by those in the pitchSource.
     * @param durationAndSegment a label sequence consisting of valid allophones according to the allophone set given in the constructor.
     * @param pitchSource a specification of an intonation contour.
     * @param target a MaryXML document at the stage of ALLOPHONES or ACOUSTPARAMS, i.e. with syllable/segment substructure in phones.
     */
    public void imposeIntonation(Labels durationAndSegmentSource, PraatPitchTier pitchSource, Document target) {
        throw new RuntimeException("Not yet implemented");
    }

    /**
     * @param args
     */
    public static void main(String[] args) throws Exception {
        String wavFilename = null;
        String labFilename = null;
        String pitchFilename = null;
        String textFilename = null;

        String locale = System.getProperty("locale");
        if (locale == null) {
            throw new IllegalArgumentException("No locale given (-Dlocale=...)");
        }

        for (String arg : args) {
            if (arg.endsWith(".txt"))
                textFilename = arg;
            else if (arg.endsWith(".wav"))
                wavFilename = arg;
            else if (arg.endsWith(".ptc"))
                pitchFilename = arg;
            else if (arg.endsWith(".lab"))
                labFilename = arg;
            else
                throw new IllegalArgumentException("Don't know how to treat argument: " + arg);
        }

        // The intonation contour
        double[] contour = null;
        double frameShiftTime = -1;
        if (pitchFilename == null) { // need to create pitch contour from wav file 
            if (wavFilename == null) {
                throw new IllegalArgumentException("Need either a pitch file or a wav file");
            }
            AudioInputStream ais = AudioSystem.getAudioInputStream(new File(wavFilename));
            AudioDoubleDataSource audio = new AudioDoubleDataSource(ais);
            PitchFileHeader params = new PitchFileHeader();
            params.fs = (int) ais.getFormat().getSampleRate();
            F0TrackerAutocorrelationHeuristic tracker = new F0TrackerAutocorrelationHeuristic(params);
            tracker.pitchAnalyze(audio);
            frameShiftTime = tracker.getSkipSizeInSeconds();
            contour = tracker.getF0Contour();
        } else { // have a pitch file -- ignore any wav file
            PitchReaderWriter f0rw = new PitchReaderWriter(pitchFilename);
            if (f0rw.contour == null) {
                throw new NullPointerException("Cannot read f0 contour from " + pitchFilename);
            }
            contour = f0rw.contour;
            frameShiftTime = f0rw.header.skipSizeInSeconds;
        }
        assert contour != null;
        assert frameShiftTime > 0;

        // The ALLOPHONES data and labels
        if (labFilename == null) {
            throw new IllegalArgumentException("No label file given");
        }
        if (textFilename == null) {
            throw new IllegalArgumentException("No text file given");
        }
        MaryTranscriptionAligner aligner = new MaryTranscriptionAligner();
        aligner.SetEnsureInitialBoundary(false);
        String labels = MaryTranscriptionAligner.readLabelFile(aligner.getEntrySeparator(),
                aligner.getEnsureInitialBoundary(), labFilename);
        MaryHttpClient mary = new MaryHttpClient();
        String text = FileUtils.readFileToString(new File(textFilename), "ASCII");
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        mary.process(text, "TEXT", "ALLOPHONES", locale, null, null, baos);
        ByteArrayInputStream bais = new ByteArrayInputStream(baos.toByteArray());
        DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance();
        docFactory.setNamespaceAware(true);
        DocumentBuilder builder = docFactory.newDocumentBuilder();
        Document doc = builder.parse(bais);
        aligner.alignXmlTranscriptions(doc, labels);
        assert doc != null;

        // durations
        double[] endTimes = new LabelfileDoubleDataSource(new File(labFilename)).getAllData();
        assert endTimes.length == labels.split(Pattern.quote(aligner.getEntrySeparator())).length;

        // Now add durations and f0 targets to document
        double prevEnd = 0;
        NodeIterator ni = MaryDomUtils.createNodeIterator(doc, MaryXML.PHONE, MaryXML.BOUNDARY);
        for (int i = 0; i < endTimes.length; i++) {
            Element e = (Element) ni.nextNode();
            if (e == null)
                throw new IllegalStateException("More durations than elements -- this should not happen!");
            double durInSeconds = endTimes[i] - prevEnd;
            int durInMillis = (int) (1000 * durInSeconds);
            if (e.getTagName().equals(MaryXML.PHONE)) {
                e.setAttribute("d", String.valueOf(durInMillis));
                e.setAttribute("end", new Formatter(Locale.US).format("%.3f", endTimes[i]).toString());
                // f0 targets at beginning, mid, and end of phone
                StringBuilder f0String = new StringBuilder();
                double startF0 = getF0(contour, frameShiftTime, prevEnd);
                if (startF0 != 0 && !Double.isNaN(startF0)) {
                    f0String.append("(0,").append((int) startF0).append(")");
                }
                double midF0 = getF0(contour, frameShiftTime, prevEnd + 0.5 * durInSeconds);
                if (midF0 != 0 && !Double.isNaN(midF0)) {
                    f0String.append("(50,").append((int) midF0).append(")");
                }
                double endF0 = getF0(contour, frameShiftTime, endTimes[i]);
                if (endF0 != 0 && !Double.isNaN(endF0)) {
                    f0String.append("(100,").append((int) endF0).append(")");
                }
                if (f0String.length() > 0) {
                    e.setAttribute("f0", f0String.toString());
                }
            } else { // boundary
                e.setAttribute("duration", String.valueOf(durInMillis));
            }
            prevEnd = endTimes[i];
        }
        if (ni.nextNode() != null) {
            throw new IllegalStateException("More elements than durations -- this should not happen!");
        }

        // TODO: add pitch values

        String acoustparams = DomUtils.document2String(doc);
        System.out.println("ACOUSTPARAMS:");
        System.out.println(acoustparams);
    }

    /**
     * For the given sampled contour and skipSize, provide the f0 value at time t if
     * possible or Double.NaN otherwise.
     * @param contour
     * @param skipSize
     * @param t
     * @return
     */
    private static double getF0(double[] contour, double skipSize, double t) {
        int i = (int) (t / skipSize);
        if (i >= 0 && i < contour.length)
            return contour[i];
        return Double.NaN;
    }

}