Module pysimt.utils.kaldi

Expand source code
import functools
import os
import struct
from typing import BinaryIO, TextIO

import numpy

# Error message templates shared by the matrix readers and writers.
# The backslash is escaped so the message shows the literal text \0B rather
# than embedding an unprintable NUL byte in the exception text.
ERROR_BINARY = "Binary mode header ('\\0B') not found when reading a matrix."
ERROR_READ_MAT = "Unknown matrix format '{}'. Supported ones: DM(float64), FM(float32)."
ERROR_WRITE_MAT = "Unknown matrix format '{}'. Supported ones are float64, float32."


def readString(f: BinaryIO) -> str:
    """Read a space-terminated token from a binary stream.

    Bytes are consumed one at a time up to and including the first space.
    The collected bytes are decoded as UTF-8 only once the whole token has
    been read, so multi-byte characters are decoded correctly.

    Args:
        f: Stream opened in binary mode, positioned at the start of the token.

    Returns:
        The decoded token without its terminating space.

    Raises:
        ValueError: If the stream ends before a space is found, or if the
            token is not valid UTF-8 (``UnicodeDecodeError``).
    """
    token = bytearray()
    while True:
        c = f.read(1)
        if not c:
            raise ValueError("EOF encountered while reading a string.")
        if c == b" ":
            return token.decode('utf-8')
        token += c


def readInteger(f: BinaryIO) -> int:
    """Read a length-prefixed, little-endian unsigned integer.

    The encoding is one byte holding the payload size, followed by that many
    payload bytes with the least significant byte first.

    Args:
        f: Stream opened in binary mode, positioned at the size byte.

    Returns:
        The decoded non-negative integer (0 for an empty payload).

    Raises:
        ValueError: If the stream ends before the size byte or before the
            whole payload could be read.
    """
    size_byte = f.read(1)
    if not size_byte:
        raise ValueError("EOF encountered while reading an integer.")
    n = size_byte[0]
    payload = f.read(n)
    # A short read would otherwise decode silently into a wrong value.
    if len(payload) != n:
        raise ValueError(
            "Truncated integer: expected %d bytes, got %d." % (n, len(payload)))
    return int.from_bytes(payload, byteorder='little', signed=False)


def readMatrix(f):
    """Read one binary matrix record from a stream.

    The record consists of the two-byte header ``\\0B``, a space-terminated
    format token ("DM" for float64, "FM" for float32), the row and column
    counts, and then ``rows * cols`` little-endian values.

    Args:
        f: Stream opened in binary mode, positioned at the header.

    Returns:
        A writable numpy array of shape ``(rows, cols)`` with dtype float64
        or float32, depending on the format token.

    Raises:
        ValueError: If the header is missing, the format token is unknown,
            or the stream ends before all values could be read.
    """
    # Compare raw bytes so arbitrary (non UTF-8) junk yields the clear
    # "header not found" error rather than a decode error.
    if f.read(2) != b"\0B":
        raise ValueError(ERROR_BINARY)
    mat_format = readString(f)
    nRows = readInteger(f)
    nCols = readInteger(f)
    if mat_format == "DM":
        wire_dtype, native_dtype = "<f8", "float64"
    elif mat_format == "FM":
        wire_dtype, native_dtype = "<f4", "float32"
    else:
        raise ValueError(ERROR_READ_MAT.format(mat_format))
    expected = nRows * nCols * numpy.dtype(wire_dtype).itemsize
    raw = f.read(expected)
    if len(raw) != expected:
        raise ValueError(
            "Truncated matrix data: expected %d bytes, got %d." % (expected, len(raw)))
    # frombuffer avoids building a tuple of Python floats; astype copies the
    # read-only view into a writable array in native byte order.
    data = numpy.frombuffer(raw, dtype=wire_dtype).astype(native_dtype)
    return data.reshape(nRows, nCols)


def readMatrixShape(f):
    """Read the shape of one binary matrix record and skip its values.

    Parses the same header as a full matrix read (``\\0B``, format token,
    row count, column count) and then seeks past the value payload instead
    of reading it.

    Args:
        f: Seekable stream opened in binary mode, positioned at the header.

    Returns:
        A ``(rows, cols)`` tuple.

    Raises:
        ValueError: If the header is missing or the format token is unknown.
    """
    # Compare raw bytes so arbitrary (non UTF-8) junk yields the clear
    # "header not found" error rather than a decode error.
    if f.read(2) != b"\0B":
        raise ValueError(
            "Binary mode header ('\\0B') not found when attempting to read a matrix.")
    mat_format = readString(f)
    nRows = readInteger(f)
    nCols = readInteger(f)
    if mat_format == "DM":
        item_size = 8   # float64
    elif mat_format == "FM":
        item_size = 4   # float32
    else:
        raise ValueError(ERROR_READ_MAT.format(mat_format))
    f.seek(nRows * nCols * item_size, os.SEEK_CUR)
    return nRows, nCols


def writeString(f, s):
    """Write the token `s` followed by a single space, encoded as UTF-8."""
    terminated = "".join((s, " "))
    encoded = terminated.encode('utf-8')
    f.write(encoded)


def writeInteger(f, a):
    """Write `a` as a size byte followed by a little-endian 32-bit signed int."""
    packed = struct.pack("<i", a)
    # One leading byte announces how many payload bytes follow.
    size_prefix = bytes((len(packed),))
    f.write(size_prefix + packed)


def writeMatrix(f, data):
    """Write a 2-D float matrix as one binary matrix record.

    The record is the header ``\\0B``, the format token ("DM" for float64,
    "FM" for float32), the row and column counts, and the values in
    row-major order as little-endian floats.

    All validation happens before the first byte is written, so a rejected
    matrix never leaves a partial record in the stream.

    Args:
        f: Stream opened in binary mode for writing.
        data: 2-D numpy array with dtype float64 or float32.

    Raises:
        ValueError: If `data` is not 2-D or its dtype is unsupported.
    """
    if data.ndim != 2:
        raise ValueError(
            "Expected a 2-D matrix, got an array with %d dimension(s)." % data.ndim)
    dtype_name = str(data.dtype)
    if dtype_name == "float64":
        mat_format, wire_dtype = "DM", "<f8"
    elif dtype_name == "float32":
        mat_format, wire_dtype = "FM", "<f4"
    else:
        raise ValueError(ERROR_WRITE_MAT.format(dtype_name))
    f.write(b"\0B")      # Binary data header
    writeString(f, mat_format)
    writeInteger(f, data.shape[0])
    writeInteger(f, data.shape[1])
    # tobytes() emits C (row-major) order, matching ravel(), without
    # converting every element to a Python float first.
    f.write(data.astype(wire_dtype, copy=False).tobytes())


def readArk(filename, limit=numpy.inf):
    """
    Load feature matrices from a Kaldi ark file.

    Records are read in order until the file is exhausted or `limit`
    matrices have been collected.
    Returns a (matrices, utterance IDs) pair of parallel lists.
    """
    matrices, keys = [], []
    with open(filename, "rb") as ark:
        reached_limit = False
        while not reached_limit:
            try:
                key = readString(ark)
            except ValueError:
                # readString raises ValueError at end of file: no more records.
                break
            matrices.append(readMatrix(ark))
            keys.append(key)
            reached_limit = len(matrices) == limit
    return matrices, keys


def readMatrixByOffset(arkfile, offset):
    """Open `arkfile`, seek to byte `offset` and return the matrix stored there."""
    with open(arkfile, "rb") as ark:
        ark.seek(offset)
        return readMatrix(ark)


def readScp(filename, limit=numpy.inf):
    """
    Reads the features in a Kaldi script file.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped.

    Args:
        filename: Path of the script file.
        limit: Stop after this many matrices have been read.

    Returns:
        A list of feature matrices and a list of the utterance IDs.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    features = []
    uttids = []
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            features.append(readMatrixByOffset(arkfile, int(offset)))
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids


def read_scp_info(filename, limit=numpy.inf):
    """
    Collects matrix shapes for the entries of a Kaldi script file.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped. Only the matrix
    header is parsed; the values are not loaded.

    Args:
        filename: Path of the script file.
        limit: Stop after this many entries have been collected.

    Returns:
        A list of (uttid, arkfile, offset, rows, cols) tuples in file order.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    res = []
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            offset = int(offset)
            with open(arkfile, "rb") as g:
                g.seek(offset)
                feat_len, feat_dim = readMatrixShape(g)
            res.append((uttid, arkfile, offset, feat_len, feat_dim))
            if len(res) == limit:
                break
    return res


def read_scp_info_dic(filename, limit=numpy.inf):
    """
    Collects matrix shapes for the entries of a Kaldi script file, by ID.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped. Only the matrix
    header is parsed; the values are not loaded.

    Args:
        filename: Path of the script file.
        limit: Stop once the dictionary holds this many entries.

    Returns:
        A dict mapping each uttid to its
        (uttid, arkfile, offset, rows, cols) tuple. A repeated uttid
        overwrites the earlier entry.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    res = {}
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            offset = int(offset)
            with open(arkfile, "rb") as g:
                g.seek(offset)
                feat_len, feat_dim = readMatrixShape(g)
            res[uttid] = (uttid, arkfile, offset, feat_len, feat_dim)
            if len(res) == limit:
                break
    return res


def writeArk(filename, features, uttids):
    """
    Append feature matrices, each preceded by its utterance ID, to a Kaldi
      ark file.
    Returns one "filename:offset" string per record, where the offset is
      the file position just after the utterance ID (the start of the
      matrix); these can be used to write a Kaldi script file.
    """
    offsets = []
    with open(filename, "ab") as ark:
        for matrix, key in zip(features, uttids):
            writeString(ark, key)
            # The matrix starts right after the utterance ID.
            matrix_start = ark.tell()
            offsets.append("%s:%d" % (filename, matrix_start))
            writeMatrix(ark, matrix)
    return offsets


def writeScp(filename, uttids, pointers):
    """
    Write a Kaldi script file with one "uttid pointer" line per pair taken
      from the utterance IDs and the "filename:offset" strings.
    Any existing file at `filename` is overwritten.
    """
    with open(filename, "w") as scp:
        scp.writelines(
            "%s %s\n" % (key, target) for key, target in zip(uttids, pointers)
        )

Functions

def readArk(filename, limit=inf)

Reads the features in a Kaldi ark file. Returns a list of feature matrices and a list of the utterance IDs.

Expand source code
def readArk(filename, limit=numpy.inf):
    """
    Load feature matrices from a Kaldi ark file.

    Records are read in order until the file is exhausted or `limit`
    matrices have been collected.
    Returns a (matrices, utterance IDs) pair of parallel lists.
    """
    matrices, keys = [], []
    with open(filename, "rb") as ark:
        reached_limit = False
        while not reached_limit:
            try:
                key = readString(ark)
            except ValueError:
                # readString raises ValueError at end of file: no more records.
                break
            matrices.append(readMatrix(ark))
            keys.append(key)
            reached_limit = len(matrices) == limit
    return matrices, keys
def readInteger(f: TextIO) ‑> str
Expand source code
def readInteger(f: BinaryIO) -> int:
    """Read a length-prefixed, little-endian unsigned integer.

    The encoding is one byte holding the payload size, followed by that many
    payload bytes with the least significant byte first.

    Args:
        f: Stream opened in binary mode, positioned at the size byte.

    Returns:
        The decoded non-negative integer (0 for an empty payload).

    Raises:
        ValueError: If the stream ends before the size byte or before the
            whole payload could be read.
    """
    size_byte = f.read(1)
    if not size_byte:
        raise ValueError("EOF encountered while reading an integer.")
    n = size_byte[0]
    payload = f.read(n)
    # A short read would otherwise decode silently into a wrong value.
    if len(payload) != n:
        raise ValueError(
            "Truncated integer: expected %d bytes, got %d." % (n, len(payload)))
    return int.from_bytes(payload, byteorder='little', signed=False)
def readMatrix(f)
Expand source code
def readMatrix(f):
    """Read one binary matrix record from a stream.

    The record consists of the two-byte header ``\\0B``, a space-terminated
    format token ("DM" for float64, "FM" for float32), the row and column
    counts, and then ``rows * cols`` little-endian values.

    Args:
        f: Stream opened in binary mode, positioned at the header.

    Returns:
        A writable numpy array of shape ``(rows, cols)`` with dtype float64
        or float32, depending on the format token.

    Raises:
        ValueError: If the header is missing, the format token is unknown,
            or the stream ends before all values could be read.
    """
    # Compare raw bytes so arbitrary (non UTF-8) junk yields the clear
    # "header not found" error rather than a decode error.
    if f.read(2) != b"\0B":
        raise ValueError(ERROR_BINARY)
    mat_format = readString(f)
    nRows = readInteger(f)
    nCols = readInteger(f)
    if mat_format == "DM":
        wire_dtype, native_dtype = "<f8", "float64"
    elif mat_format == "FM":
        wire_dtype, native_dtype = "<f4", "float32"
    else:
        raise ValueError(ERROR_READ_MAT.format(mat_format))
    expected = nRows * nCols * numpy.dtype(wire_dtype).itemsize
    raw = f.read(expected)
    if len(raw) != expected:
        raise ValueError(
            "Truncated matrix data: expected %d bytes, got %d." % (expected, len(raw)))
    # frombuffer avoids building a tuple of Python floats; astype copies the
    # read-only view into a writable array in native byte order.
    data = numpy.frombuffer(raw, dtype=wire_dtype).astype(native_dtype)
    return data.reshape(nRows, nCols)
def readMatrixByOffset(arkfile, offset)
Expand source code
def readMatrixByOffset(arkfile, offset):
    """Open `arkfile`, seek to byte `offset` and return the matrix stored there."""
    with open(arkfile, "rb") as ark:
        ark.seek(offset)
        return readMatrix(ark)
def readMatrixShape(f)
Expand source code
def readMatrixShape(f):
    """Read the shape of one binary matrix record and skip its values.

    Parses the same header as a full matrix read (``\\0B``, format token,
    row count, column count) and then seeks past the value payload instead
    of reading it.

    Args:
        f: Seekable stream opened in binary mode, positioned at the header.

    Returns:
        A ``(rows, cols)`` tuple.

    Raises:
        ValueError: If the header is missing or the format token is unknown.
    """
    # Compare raw bytes so arbitrary (non UTF-8) junk yields the clear
    # "header not found" error rather than a decode error.
    if f.read(2) != b"\0B":
        raise ValueError(
            "Binary mode header ('\\0B') not found when attempting to read a matrix.")
    mat_format = readString(f)
    nRows = readInteger(f)
    nCols = readInteger(f)
    if mat_format == "DM":
        item_size = 8   # float64
    elif mat_format == "FM":
        item_size = 4   # float32
    else:
        raise ValueError(ERROR_READ_MAT.format(mat_format))
    f.seek(nRows * nCols * item_size, os.SEEK_CUR)
    return nRows, nCols
def readScp(filename, limit=inf)

Reads the features in a Kaldi script file. Returns a list of feature matrices and a list of the utterance IDs.

Expand source code
def readScp(filename, limit=numpy.inf):
    """
    Reads the features in a Kaldi script file.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped.

    Args:
        filename: Path of the script file.
        limit: Stop after this many matrices have been read.

    Returns:
        A list of feature matrices and a list of the utterance IDs.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    features = []
    uttids = []
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            features.append(readMatrixByOffset(arkfile, int(offset)))
            uttids.append(uttid)
            if len(features) == limit:
                break
    return features, uttids
def readString(f: TextIO) ‑> str
Expand source code
def readString(f: BinaryIO) -> str:
    """Read a space-terminated token from a binary stream.

    Bytes are consumed one at a time up to and including the first space.
    The collected bytes are decoded as UTF-8 only once the whole token has
    been read, so multi-byte characters are decoded correctly.

    Args:
        f: Stream opened in binary mode, positioned at the start of the token.

    Returns:
        The decoded token without its terminating space.

    Raises:
        ValueError: If the stream ends before a space is found, or if the
            token is not valid UTF-8 (``UnicodeDecodeError``).
    """
    token = bytearray()
    while True:
        c = f.read(1)
        if not c:
            raise ValueError("EOF encountered while reading a string.")
        if c == b" ":
            return token.decode('utf-8')
        token += c
def read_scp_info(filename, limit=inf)
Expand source code
def read_scp_info(filename, limit=numpy.inf):
    """
    Collects matrix shapes for the entries of a Kaldi script file.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped. Only the matrix
    header is parsed; the values are not loaded.

    Args:
        filename: Path of the script file.
        limit: Stop after this many entries have been collected.

    Returns:
        A list of (uttid, arkfile, offset, rows, cols) tuples in file order.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    res = []
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            offset = int(offset)
            with open(arkfile, "rb") as g:
                g.seek(offset)
                feat_len, feat_dim = readMatrixShape(g)
            res.append((uttid, arkfile, offset, feat_len, feat_dim))
            if len(res) == limit:
                break
    return res
def read_scp_info_dic(filename, limit=inf)
Expand source code
def read_scp_info_dic(filename, limit=numpy.inf):
    """
    Collects matrix shapes for the entries of a Kaldi script file, by ID.

    Each non-blank line has the form "uttid arkfile:offset". The first
    whitespace separates the utterance ID from the pointer, so the archive
    path itself may contain spaces. Blank lines are skipped. Only the matrix
    header is parsed; the values are not loaded.

    Args:
        filename: Path of the script file.
        limit: Stop once the dictionary holds this many entries.

    Returns:
        A dict mapping each uttid to its
        (uttid, arkfile, offset, rows, cols) tuple. A repeated uttid
        overwrites the earlier entry.

    Raises:
        ValueError: If a line has no pointer, or the pointer lacks the
            ":offset" suffix or has a non-integer offset.
    """
    res = {}
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue    # tolerate blank lines (e.g. a trailing newline)
            uttid, pointer = line.split(None, 1)
            arkfile, sep, offset = pointer.rpartition(":")
            if not sep:
                raise ValueError(
                    "Malformed scp pointer '%s': expected 'arkfile:offset'." % pointer)
            offset = int(offset)
            with open(arkfile, "rb") as g:
                g.seek(offset)
                feat_len, feat_dim = readMatrixShape(g)
            res[uttid] = (uttid, arkfile, offset, feat_len, feat_dim)
            if len(res) == limit:
                break
    return res
def writeArk(filename, features, uttids)

Takes a list of feature matrices and a list of utterance IDs, and writes them to a Kaldi ark file. Returns a list of strings in the format "filename:offset", which can be used to write a Kaldi script file.

Expand source code
def writeArk(filename, features, uttids):
    """
    Append feature matrices, each preceded by its utterance ID, to a Kaldi
      ark file.
    Returns one "filename:offset" string per record, where the offset is
      the file position just after the utterance ID (the start of the
      matrix); these can be used to write a Kaldi script file.
    """
    offsets = []
    with open(filename, "ab") as ark:
        for matrix, key in zip(features, uttids):
            writeString(ark, key)
            # The matrix starts right after the utterance ID.
            matrix_start = ark.tell()
            offsets.append("%s:%d" % (filename, matrix_start))
            writeMatrix(ark, matrix)
    return offsets
def writeInteger(f, a)
Expand source code
def writeInteger(f, a):
    """Write `a` as a size byte followed by a little-endian 32-bit signed int."""
    packed = struct.pack("<i", a)
    # One leading byte announces how many payload bytes follow.
    size_prefix = bytes((len(packed),))
    f.write(size_prefix + packed)
def writeMatrix(f, data)
Expand source code
def writeMatrix(f, data):
    """Write a 2-D float matrix as one binary matrix record.

    The record is the header ``\\0B``, the format token ("DM" for float64,
    "FM" for float32), the row and column counts, and the values in
    row-major order as little-endian floats.

    All validation happens before the first byte is written, so a rejected
    matrix never leaves a partial record in the stream.

    Args:
        f: Stream opened in binary mode for writing.
        data: 2-D numpy array with dtype float64 or float32.

    Raises:
        ValueError: If `data` is not 2-D or its dtype is unsupported.
    """
    if data.ndim != 2:
        raise ValueError(
            "Expected a 2-D matrix, got an array with %d dimension(s)." % data.ndim)
    dtype_name = str(data.dtype)
    if dtype_name == "float64":
        mat_format, wire_dtype = "DM", "<f8"
    elif dtype_name == "float32":
        mat_format, wire_dtype = "FM", "<f4"
    else:
        raise ValueError(ERROR_WRITE_MAT.format(dtype_name))
    f.write(b"\0B")      # Binary data header
    writeString(f, mat_format)
    writeInteger(f, data.shape[0])
    writeInteger(f, data.shape[1])
    # tobytes() emits C (row-major) order, matching ravel(), without
    # converting every element to a Python float first.
    f.write(data.astype(wire_dtype, copy=False).tobytes())
def writeScp(filename, uttids, pointers)

Takes a list of utterance IDs and a list of strings in the format "filename:offset", and writes them to a Kaldi script file.

Expand source code
def writeScp(filename, uttids, pointers):
    """
    Write a Kaldi script file with one "uttid pointer" line per pair taken
      from the utterance IDs and the "filename:offset" strings.
    Any existing file at `filename` is overwritten.
    """
    with open(filename, "w") as scp:
        scp.writelines(
            "%s %s\n" % (key, target) for key, target in zip(uttids, pointers)
        )
def writeString(f, s)
Expand source code
def writeString(f, s):
    """Write the token `s` followed by a single space, encoded as UTF-8."""
    terminated = "".join((s, " "))
    encoded = terminated.encode('utf-8')
    f.write(encoded)