Module sppas.src.resources

Class sppasUnigram

Description

Class to represent a simple unigram: a set of token/count.

A unigram is commonly a data structure with tokens and their probabilities, and a back-off value. It is a statistical language model. This class is a simplified version with only tokens and their occurrences.

Notice that tokens are case-sensitive.

Constructor

Create a sppasUnigram instance.

Parameters

filename: (str) Name of the file with words and counts (2 columns)
nodump: (bool) Disable the creation of a dump file

View Source

def __init__(self, filename=None, nodump=True):
    """Create a sppasUnigram instance.

    :param filename: (str) Name of the file with words and counts (2 columns)
    :param nodump: (bool) Disable the creation of a dump file

    """
    self.__sum = 0
    self.__entries = dict()
    if filename is None:
        return

    data = None
    dp = sppasDumpFile(filename)
    # Try the dump file first, unless dumps are disabled.
    if nodump is False:
        data = dp.load_from_dump()

    if data is None:
        # No usable dump: parse the ASCII file, then write a dump of the
        # entries if dumps are enabled.
        self.load_from_ascii(filename)
        if nodump is False:
            dp.save_as_dump(self.__entries)
    else:
        # Only the entries are written to the dump, so the total of the
        # counts must be recomputed here; otherwise it would stay at 0.
        self.__entries = data
        self.__sum = sum(data.values())

Public functions

add

Add or increment a token in the unigram.

Parameters

entry: (str) String of the token to add
value: (int) Value to increment the count

Raises

PositiveValueError

View Source

def add(self, entry, value=1):
    """Insert a token, or increase its count if it is already known.

    :param entry: (str) String of the token to add
    :param value: (int) Value to increment the count
    :raises: PositiveValueError if the value is not strictly positive

    """
    token = sppasUnicode(entry).to_strip()
    increment = int(value)
    if not increment > 0:
        raise PositiveValueError(count=increment)

    if token in self.__entries:
        self.__entries[token] += increment
    else:
        self.__entries[token] = increment
    # Keep the running total in step with the entries.
    self.__sum += increment

get_count

Return the count of a token.

Parameters

token: (str) The string of the token

View Source

def get_count(self, token):
    """Return the number of occurrences stored for a token.

    The token is stripped before the lookup; 0 is returned if it is unknown.

    :param token: (str) The string of the token
    :returns: (int) The count of the token, or 0

    """
    stripped = sppasUnicode(token).to_strip()
    if stripped in self.__entries:
        return self.__entries[stripped]
    return 0

get_sum

Return the sum of all counts (of all tokens).

View Source

def get_sum(self):
    """Return the total of the counts of all tokens.

    :returns: (int) The accumulated sum of the counts

    """
    total = self.__sum
    return total

get_tokens

Return a list with all tokens.

View Source

def get_tokens(self):
    """Return a list with all tokens.

    :returns: (list) A new list of the tokens; changing it does not
        modify the unigram.

    """
    # dict.keys() is a live view in Python 3, not a list: build a real
    # list so that the result matches the documented return type.
    return list(self.__entries)

load_from_ascii

Load a unigram from a file with two columns: word count.

Parameters

filename: (str) Name of the unigram ASCII file to read

View Source

def load_from_ascii(self, filename):
    """Read a two-column file (word count) and add its content.

    Columns are separated by whitespace. Lines with fewer than two
    columns are ignored, and any column after the second one is unused.

    :param filename: (str) Name of the unigram ASCII file to read

    """
    # Read the whole file before adding anything.
    with codecs.open(filename, 'r', sg.__encoding__) as fd:
        content = fd.readlines()

    for raw in content:
        columns = raw.split()
        # Blank lines yield no column at all, so they are skipped here too.
        if len(columns) < 2:
            continue
        self.add(columns[0], int(columns[1]))

save_as_ascii

Save a unigram into a file with two columns: word freq.

Parameters

filename: (str) Name of the unigram ASCII file to write

Returns

(bool)

View Source

def save_as_ascii(self, filename):
    """Write the unigram into a file with two columns: word count.

    Tokens are written in sorted order, one per line. Any error is
    logged and reported through the returned value instead of raised.

    :param filename: (str) Name of the unigram ASCII file to write
    :returns: (bool) True if the file was written, False otherwise

    """
    try:
        with codecs.open(filename, 'w', encoding=sg.__encoding__) as output:
            for token in sorted(self.__entries):
                row = '{:s} {:d}\n'.format(token, self.__entries[token])
                output.write(row)
    except Exception as e:
        message = 'Save file failed due to the following error: {:s}'.format(str(e))
        logging.info(message)
        return False
    else:
        return True

Overloads

len

View Source

def __len__(self):
    """Return the number of distinct tokens of the unigram."""
    entries = self.__entries
    return len(entries)

contains

View Source

def __contains__(self, item):
    """Return True if the stripped item is a token of the unigram.

    :param item: (str) The string of the token to look for
    :returns: (bool)

    """
    key = sppasUnicode(item).to_strip()
    return key in self.__entries