Module pysimt.utils.io

I/O related utility implementations.

Expand source code
"""I/O related utility implementations."""

import bz2
import gzip
import lzma
import pathlib

from collections import deque
from typing import List, Iterable, Any

import numpy as np
from tqdm import tqdm


class FileRotator:
    """Bounded queue of Path() objects that deletes evicted files.

    New elements are inserted on the left of the underlying deque. Once
    the queue holds `maxlen` elements, the next push takes the rightmost
    (i.e. oldest) element out and unlinks it from disk if it still exists.

    Arguments:
        maxlen(int): The capacity of the queue.
    """

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.elems = deque(maxlen=maxlen)

    def push(self, elem):
        """Insert `elem`, evicting and deleting the oldest entry if full."""
        queue = self.elems
        is_full = len(queue) == self.maxlen

        if is_full:
            # The oldest entry lives at the right end of the deque
            oldest = queue.pop()
            if oldest.exists():
                oldest.unlink()

        # Newest entry always goes to the left end
        queue.appendleft(elem)

    def __repr__(self):
        return repr(self.elems)


def fopen(filename: str, key: str = None):
    """gzip,bzip2,xz,numpy aware file opening function.

    The handler is picked from the suffix of `filename` after `~` has been
    expanded.

    Arguments:
        filename(str): Path of the file to open. Glob patterns are rejected.
        key(str, optional): Name of the array to extract from a `.npz`
            archive. Required for `.npz` files, unused otherwise.

    Returns:
        A text-mode file object for `.gz`, `.bz2`, `.xz`, `.lzma` and plain
        files (the caller is responsible for closing it), the result of
        `np.load()` for `.npy` files, or the array stored under `key`
        for `.npz` files.

    Raises:
        AssertionError: If `filename` contains `*`, or if a `.npz` file
            is requested without a `key`.
    """
    assert '*' not in str(filename), "Glob patterns not supported in fopen()"

    filename = str(pathlib.Path(filename).expanduser())
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rt')
    elif filename.endswith('.bz2'):
        return bz2.open(filename, 'rt')
    elif filename.endswith(('.xz', '.lzma')):
        return lzma.open(filename, 'rt')
    elif filename.endswith('.npz'):
        assert key is not None, "No key= given for .npz file."
        # np.load() returns an NpzFile that keeps the archive open; read the
        # requested array and close the archive instead of leaking the handle.
        with np.load(filename) as archive:
            return archive[key]
    elif filename.endswith('.npy'):
        return np.load(filename)
    else:
        # Plain text
        return open(filename, 'r')


def read_hypothesis_file(fname: str) -> List[str]:
    """Return the lines of the text file `fname` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; empty lines are kept as empty strings.
    """
    with open(fname) as handle:
        return [raw.strip() for raw in handle]


def read_reference_files(*args) -> List[List[str]]:
    """Load each file named in `args` into its own list of stripped lines.

    The result holds one inner list per file, in the order the files were
    given, which is the layout used for multiple references. All files
    must contain the same number of lines, otherwise the assertion below
    fails.
    """
    all_lines = []

    for path in args:
        with open(path) as handle:
            all_lines.append([raw.strip() for raw in handle])

    # Exactly one distinct length means every stream is equally long
    distinct_lengths = {len(stream) for stream in all_lines}
    assert len(distinct_lengths) == 1, \
        "Reference streams do not have the same lengths."

    return all_lines


def progress_bar(iterator: Iterable[Any], unit: str = 'it'):
    """Return `iterator` wrapped in a tqdm progress bar.

    The bar is 70 columns wide, labels its rate with `unit` and has
    smoothing set to 0.
    """
    bar_options = {'unit': unit, 'ncols': 70, 'smoothing': 0}
    return tqdm(iterator, **bar_options)

Functions

def fopen(filename: str, key: str = None)

A gzip-, bzip2-, xz-, and NumPy-aware file opening function.

Expand source code
def fopen(filename: str, key: str = None):
    """gzip,bzip2,xz,numpy aware file opening function.

    The handler is picked from the suffix of `filename` after `~` has been
    expanded.

    Arguments:
        filename(str): Path of the file to open. Glob patterns are rejected.
        key(str, optional): Name of the array to extract from a `.npz`
            archive. Required for `.npz` files, unused otherwise.

    Returns:
        A text-mode file object for `.gz`, `.bz2`, `.xz`, `.lzma` and plain
        files (the caller is responsible for closing it), the result of
        `np.load()` for `.npy` files, or the array stored under `key`
        for `.npz` files.

    Raises:
        AssertionError: If `filename` contains `*`, or if a `.npz` file
            is requested without a `key`.
    """
    assert '*' not in str(filename), "Glob patterns not supported in fopen()"

    filename = str(pathlib.Path(filename).expanduser())
    if filename.endswith('.gz'):
        return gzip.open(filename, 'rt')
    elif filename.endswith('.bz2'):
        return bz2.open(filename, 'rt')
    elif filename.endswith(('.xz', '.lzma')):
        return lzma.open(filename, 'rt')
    elif filename.endswith('.npz'):
        assert key is not None, "No key= given for .npz file."
        # np.load() returns an NpzFile that keeps the archive open; read the
        # requested array and close the archive instead of leaking the handle.
        with np.load(filename) as archive:
            return archive[key]
    elif filename.endswith('.npy'):
        return np.load(filename)
    else:
        # Plain text
        return open(filename, 'r')
def progress_bar(iterator: Iterable[Any], unit: str = 'it')

Wraps the given iterator into tqdm for progress bar rendering.

Expand source code
def progress_bar(iterator: Iterable[Any], unit: str = 'it'):
    """Return `iterator` wrapped in a tqdm progress bar.

    The bar is 70 columns wide, labels its rate with `unit` and has
    smoothing set to 0.
    """
    bar_options = {'unit': unit, 'ncols': 70, 'smoothing': 0}
    return tqdm(iterator, **bar_options)
def read_hypothesis_file(fname: str) ‑> List[str]

Reads lines from a text file and returns them as a list of strings.

Expand source code
def read_hypothesis_file(fname: str) -> List[str]:
    """Return the lines of the text file `fname` as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped
    from every line; empty lines are kept as empty strings.
    """
    with open(fname) as handle:
        return [raw.strip() for raw in handle]
def read_reference_files(*args) ‑> List[List[str]]

Read every file given in args and produce a list of lists that supports multiple references.

Expand source code
def read_reference_files(*args) -> List[List[str]]:
    """Load each file named in `args` into its own list of stripped lines.

    The result holds one inner list per file, in the order the files were
    given, which is the layout used for multiple references. All files
    must contain the same number of lines, otherwise the assertion below
    fails.
    """
    all_lines = []

    for path in args:
        with open(path) as handle:
            all_lines.append([raw.strip() for raw in handle])

    # Exactly one distinct length means every stream is equally long
    distinct_lengths = {len(stream) for stream in all_lines}
    assert len(distinct_lengths) == 1, \
        "Reference streams do not have the same lengths."

    return all_lines

Classes

class FileRotator (maxlen)

A fixed queue with Path() elements where pushing a new element pops the oldest one and removes it from disk.

Arguments

maxlen(int): The capacity of the queue.

Expand source code
class FileRotator:
    """Bounded queue of Path() objects that deletes evicted files.

    New elements are inserted on the left of the underlying deque. Once
    the queue holds `maxlen` elements, the next push takes the rightmost
    (i.e. oldest) element out and unlinks it from disk if it still exists.

    Arguments:
        maxlen(int): The capacity of the queue.
    """

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.elems = deque(maxlen=maxlen)

    def push(self, elem):
        """Insert `elem`, evicting and deleting the oldest entry if full."""
        queue = self.elems
        is_full = len(queue) == self.maxlen

        if is_full:
            # The oldest entry lives at the right end of the deque
            oldest = queue.pop()
            if oldest.exists():
                oldest.unlink()

        # Newest entry always goes to the left end
        queue.appendleft(elem)

    def __repr__(self):
        return repr(self.elems)

Methods

def push(self, elem)
Expand source code
def push(self, elem):
    """Insert `elem`, evicting and deleting the oldest entry if full."""
    queue = self.elems
    is_full = len(queue) == self.maxlen

    if is_full:
        # The oldest entry lives at the right end of the deque
        oldest = queue.pop()
        if oldest.exists():
            oldest.unlink()

    # Newest entry always goes to the left end
    queue.appendleft(elem)