SPPAS 4.22

https://sppas.org/

Module sppas.src.resources

Class sppasDictRepl

Description

A dictionary to manage automated replacements.

A dictionary with specific features for language resources. The main feature is that values are "accumulated".

Example
 >>> d = sppasDictRepl()
 >>> d.add("key", "v1")
 >>> d.add("key", "v2")
 >>> d.get("key")
 v1|v2
 >>> d.is_value("v1")
 True
 >>> d.is_value("v1|v2")
 False

Constructor

Create a sppasDictRepl instance.

Parameters
  • dict_filename: (str) The dictionary file name (2 columns)
  • nodump: (bool) Disable the creation of a dump file. A dump file is a binary version of the dictionary. Its size is greater than the original ASCII dictionary but the time to load it is divided by two or three.
View Source
def __init__(self, dict_filename=None, nodump=False):
    """Create a sppasDictRepl instance.

    Without a file name the dictionary starts empty. With a file name,
    the entries come from the dump file when dumps are enabled and one
    can be loaded; otherwise they are read from the ASCII file, and a
    dump is then saved unless dumps are disabled.

    :param dict_filename: (str) The dictionary file name (2 columns)
    :param nodump: (bool) Disable the creation of a dump file.
    A dump file is a binary version of the dictionary. Its size is greater
    than the original ASCII dictionary but the time to load it is divided
    by two or three.

    """
    self._dict = dict()
    self._filename = ''
    if dict_filename is None:
        # Nothing to load: keep the empty dictionary.
        return

    self._filename = dict_filename
    dump = sppasDumpFile(dict_filename)
    use_dump = nodump is False

    # Try the binary dump first (only when dumps are allowed).
    cached = dump.load_from_dump() if use_dump else None
    if cached is not None:
        self._dict = cached
        return

    # No usable dump: parse the ASCII file, then refresh the dump.
    self.load_from_ascii(dict_filename)
    if use_dump:
        dump.save_as_dump(self._dict)

Public functions

get_filename

Return the name of the file from which the vocab comes.

View Source
def get_filename(self):
    """Return the name of the file from which the vocab comes.

    :returns: (str) The stored file name; an empty string if none was set.

    """
    filename = self._filename
    return filename

is_key

Return True if entry is exactly a key in the dictionary.

Parameters
  • entry: (str) Unicode string.
View Source
def is_key(self, entry):
    """Return True if entry is exactly a key in the dictionary.

    The entry is only converted with u(); it is not passed through
    format_token(), so it must match the stored key as is.

    :param entry: (str) Unicode string.
    :returns: (bool)

    """
    unicode_entry = u(entry)
    return unicode_entry in self._dict

is_value

Return True if entry is a value in the dictionary.

Parameters
  • entry: (str) Unicode string.
View Source
def is_value(self, entry):
    """Return True if entry is a value in the dictionary.

    Each stored string is split on REPLACE_SEPARATOR and the formatted
    entry is compared to every individual part.

    :param entry: (str) Unicode string.
    :returns: (bool)

    """
    target = sppasDictRepl.format_token(entry)
    return any(
        target in stored.split(sppasDictRepl.REPLACE_SEPARATOR)
        for stored in self._dict.values()
    )

is_value_of

Return True if entry is a value of a given key in the dictionary.

Parameters
  • key: (str) Unicode string.
  • entry: (str) Unicode string.
View Source
def is_value_of(self, key, entry):
    """Return True if entry is a value of a given key in the dictionary.

    The string stored for the key (an empty string if the key is
    missing) is split on REPLACE_SEPARATOR and the formatted entry is
    searched among the parts.

    :param key: (str) Unicode string.
    :param entry: (str) Unicode string.
    :returns: (bool)

    """
    target = sppasDictRepl.format_token(entry)
    stored = self.get(key, '')
    return target in stored.split(sppasDictRepl.REPLACE_SEPARATOR)

is_unk

Return True if entry is not a key in the dictionary.

Parameters
  • entry: (str) Unicode string.
View Source
def is_unk(self, entry):
    """Return True if entry is not a key in the dictionary.

    The entry is formatted with format_token() before the lookup.

    :param entry: (str) Unicode string.
    :returns: (bool)

    """
    token = sppasDictRepl.format_token(entry)
    known = token in self._dict
    return not known

is_empty

Return True if there is no entry in the dictionary.

View Source
def is_empty(self):
    """Return True if there is no entry in the dictionary.

    :returns: (bool)

    """
    return not self._dict

get

Return the value of a key of the dictionary or substitution.

Parameters
  • entry: (str) A token to find in the dictionary
  • substitution: (str) String to return if token is missing from the dict
Returns
  • unicode of the replacement or the substitution.
View Source
def get(self, entry, substitution=''):
    """Return the value of a key of the dictionary or substitution.

    The entry is formatted with format_token() before the lookup.

    :param entry: (str) A token to find in the dictionary
    :param substitution: (str) String to return if token is missing from the dict
    :returns: unicode of the replacement or the substitution.

    """
    token = sppasDictRepl.format_token(entry)
    if token in self._dict:
        return self._dict[token]
    return substitution

replace

Return the value of a key, or an empty string if key has no replacement.

Parameters
  • key
View Source
def replace(self, key):
    """Return the value of a key, or an empty string if it has none.

    This is a shortcut for get() with an empty-string substitution, so
    a missing key yields '' and not None.

    :param key: (str) A token to find in the dictionary
    :returns: unicode of the replacement, or an empty string.

    """
    return self.get(key, '')

replace_reversed

Return the key(s) of a value or an empty string.

Parameters
  • value: (str) value to search
Returns
  • a unicode string with all keys, joined with the class separator (REPLACE_SEPARATOR), or an empty string if value does not exist.
View Source
def replace_reversed(self, value):
    """Return the key(s) of a value or an empty string.

    Each stored string is split on REPLACE_SEPARATOR; a key is selected
    when the formatted value is one of the parts. A key is listed once
    even if the value is repeated inside its stored string.

    :param value: (str) value to search
    :returns: a unicode string with all keys, joined with
    REPLACE_SEPARATOR, or an empty string if value does not exist.

    """
    target = sppasDictRepl.format_token(value)
    sep = sppasDictRepl.REPLACE_SEPARATOR
    keys = [k for k, v in self._dict.items() if target in v.split(sep)]
    # Joining an empty list already gives '', no special case needed.
    return sep.join(keys)

format_token

Remove the CR/LF, tabs, multiple spaces and others... and lower.

Parameters
  • entry: (str) a token
Returns
  • formatted token
View Source
@staticmethod
def format_token(entry):
    """Return the entry formatted by sppasUnicode.to_strip().

    NOTE(review): the former description claimed this removes CR/LF,
    tabs and multiple spaces "and lower"; only the to_strip() call is
    visible here -- confirm whether lowercasing really happens.

    :param entry: (str) a token
    :returns: formatted token

    """
    token = sppasUnicode(entry)
    return token.to_strip()

add

Add a new key,value into the dict.

Add as a new pair or append the value to the existing one with a "|" used as separator.

Parameters
  • token: (str) string of the token to add
  • repl: (str) the replacement token

Both token and repl are converted to unicode (if needed) and stripped.

View Source
def add(self, token, repl):
    """Add a new key,value into the dict.

    Add as a new pair or append the value to the existing one, using
    REPLACE_SEPARATOR as separator. If the value is already one of the
    values of the key, the dictionary is left unchanged.

    :param token: (str) string of the token to add
    :param repl: (str) the replacement token

    Both token and repl are formatted with format_token().

    """
    key = sppasDictRepl.format_token(token)
    value = sppasDictRepl.format_token(repl)

    if key in self._dict:
        if self.is_value_of(key, value):
            # Already recorded: do not overwrite the accumulated values
            # with this single one.
            return
        # Use the same separator as the one used to split the values.
        value = sppasDictRepl.REPLACE_SEPARATOR.join((self._dict[key], value))

    self._dict[key] = value

pop

Remove an entry, as key.

Parameters
  • entry: (str) unicode string of the entry to remove
View Source
def pop(self, entry):
    """Remove an entry, as key.

    Nothing happens if the formatted entry is not a key.

    :param entry: (str) unicode string of the entry to remove

    """
    key = sppasDictRepl.format_token(entry)
    # The default avoids a KeyError when the key is absent.
    self._dict.pop(key, None)

remove

Remove an entry, as key or value.

Parameters
  • entry: (str) unicode string of the entry to remove
View Source
def remove(self, entry):
    """Remove an entry, as key or value.

    Every key equal to the formatted entry, or having the entry among
    its values, is removed together with all of its values.

    :param entry: (str) unicode string of the entry to remove

    """
    target = sppasDictRepl.format_token(entry)
    # Collect first: the dict must not change size while iterating it.
    doomed = [
        key for key in self._dict
        if key == target or self.is_value_of(key, entry)
    ]
    for key in doomed:
        del self._dict[key]

load_from_ascii

Load a replacement dictionary from an ascii file.

Parameters
  • filename: (str) Replacement dictionary file name
View Source
def load_from_ascii(self, filename):
    """Load a replacement dictionary from an ascii file.

    Each line is split on whitespace: the first column is the key and
    the remaining columns, joined with REPLACE_SEPARATOR, are the value.
    Lines with fewer than two columns are ignored. Entries are added
    with add(), so they accumulate with the existing ones.

    :param filename: (str) Replacement dictionary file name
    :raises: FileUnicodeError if the file can't be decoded

    """
    # The context manager closes the file; no explicit close() needed.
    with codecs.open(filename, 'r', sg.__encoding__) as fd:
        try:
            lines = fd.readlines()
        except UnicodeDecodeError:
            raise FileUnicodeError(filename=filename)

    self._filename = filename
    for line in lines:
        # split() without argument already drops CR/LF, tabs and
        # repeated spaces; blank lines give an empty list.
        columns = line.split()
        if len(columns) < 2:
            continue
        key = columns[0]
        value = sppasDictRepl.REPLACE_SEPARATOR.join(columns[1:])
        self.add(key, value)

save_as_ascii

Save the replacement dictionary.

Parameters
  • filename: (str)
Returns
  • (bool)
View Source
def save_as_ascii(self, filename):
    """Save the replacement dictionary.

    Keys are written in sorted order, one "key value" line per
    individual value (the stored string is split on REPLACE_SEPARATOR).
    Any error is logged and reported through the return value.

    :param filename: (str)
    :returns: (bool) False if saving failed, True otherwise

    """
    try:
        with codecs.open(filename, 'w', encoding=sg.__encoding__) as output:
            for entry in sorted(self._dict):
                stored = self._dict[entry]
                for part in stored.split(sppasDictRepl.REPLACE_SEPARATOR):
                    output.write('{:s} {:s}\n'.format(entry, part.strip()))
    except Exception as e:
        logging.info('Saving file failed due to the following error: {:s}'.format(str(e)))
        return False
    return True

Overloads

__str__

View Source
def __str__(self):
    """Return the string representation of the underlying dict."""
    as_text = str(self._dict)
    return as_text

__len__

View Source
def __len__(self):
    """Return the number of keys in the dictionary."""
    nb_keys = len(self._dict)
    return nb_keys

__contains__

View Source
def __contains__(self, item):
    """Return True if the formatted item is a key of the dictionary."""
    key = sppasDictRepl.format_token(item)
    return key in self._dict

__iter__

View Source
def __iter__(self):
    """Yield the keys of the dictionary, one at a time."""
    for key in self._dict.keys():
        yield key

__getitem__

View Source
def __getitem__(self, item):
    """Return the value stored for item.

    The item is converted with str() and formatted with format_token()
    before the lookup; a missing key raises KeyError.

    """
    as_text = str(item)
    key = sppasDictRepl.format_token(as_text)
    return self._dict[key]