Module sppas.src.annotations

Class sppasSelfRepet

Description

SPPAS Automatic Self-Repetition Detection.

Detect self-repetitions. The result has never been validated by an expert. This annotation is performed on the basis of time-aligned tokens or lemmas. The output is made of a tier with the sources and a tier with the echos, plus 3 tiers describing each source (its labels, its length and its type).

Constructor

Create a new sppasSelfRepet instance.

Parameters

log: (sppasLog) Human-readable logs.

View Source

def __init__(self, log=None):
    """Create a new sppasSelfRepet instance.

    The name 'selfrepet.json' and the given log are passed to the
    constructor of the parent class.

    :param log: (sppasLog) Human-readable logs.

    """
    super(sppasSelfRepet, self).__init__('selfrepet.json', log)

Public functions

self_detection

Self-Repetition detection.

Parameters

tier: (sppasTier)

View Source

def self_detection(self, tier):
    """Detect the self-repetitions among the tokens of a tier.

    The tier is scanned window by window. Each time a source is detected,
    the source and its echos are added to the tiers of the result, and the
    scan restarts right after the end of that source.

    :param tier: (sppasTier) Tier in which repetitions are searched
    :returns: (sppasTranscription) Contains the tiers 'SR-Source',
    'SR-SrcStrain', 'SR-SrcLen', 'SR-SrcType' and 'SR-Repet'

    """
    result = sppasTranscription(self.name)
    for tier_name in ('SR-Source', 'SR-SrcStrain', 'SR-SrcLen',
                      'SR-SrcType', 'SR-Repet'):
        result.create_tier(tier_name)

    # evaluate() is applied to a copy, not to self._stop_words itself.
    ignored_words = self._stop_words.copy()
    ignored_words.evaluate(tier, merge=True)
    detector = SelfRepetition(ignored_words)

    first, search_end, last = self.__fix_indexes(tier, 0, 0)
    while first < last:
        # Serialized labels of the intervals first..last (both included).
        window = list()
        for idx in range(first, last + 1):
            window.append(serialize_labels(tier[idx].get_labels()))
        detector.detect(DataSpeaker(window), search_end - first)

        if detector.get_source() is None:
            # Nothing detected: move the window one token forward.
            step = 1
        else:
            sppasSelfRepet.__add_repetition(detector, tier, first, result)
            # Source indexes are relative to the window: restart after it.
            source_first, source_last = detector.get_source()
            step = source_last + 1

        first, search_end, last = self.__fix_indexes(tier, first, step)

    return result

run

Run the automatic annotation process on an input.

Parameters

input_files: (list of str) Time-aligned tokens
output: (str) the output file name

Returns

(sppasTranscription) if no output is given; otherwise a list with the name of the written file

View Source

def run(self, input_files, output=None):
    """Run the automatic annotation process on an input.

    Only the first file of input_files is read.

    :param input_files: (list of str) Time-aligned tokens
    :param output: (str) the output file name
    :returns: (sppasTranscription) if output is None, or a list with the
    name of the written file if output is given
    :raises: EmptyOutputError if output is given and the result is empty

    """
    # Read the input and get the tier to work on
    trs_input = sppasTrsRW(input_files[0]).read()
    aligned = sppasFindTier.aligned_tokens(trs_input)
    tier_input = self.make_word_strain(aligned)

    # Detection, then metadata of the result
    trs_output = self.self_detection(tier_input)
    trs_output.set_meta('self_repetition_result_of', input_files[0])
    self.transfer_metadata(trs_input, trs_output)

    # Optional extra tiers
    if len(self._word_strain) > 0:
        trs_output.append(tier_input)
    if self._options['stopwords'] is True:
        trs_output.append(self.make_stop_words(tier_input))

    # No file requested: give the transcription back
    if output is None:
        return trs_output

    # A file is requested: refuse to write an empty result
    if len(trs_output) == 0:
        raise EmptyOutputError

    output_file = self.fix_out_file_ext(output)
    sppasTrsRW(output_file).write(trs_output)
    return [output_file]

get_output_pattern

Pattern this annotation uses in an output filename.

View Source

def get_output_pattern(self):
    """Return the pattern this annotation uses in an output filename.

    :returns: (str) The 'outputpattern' option, or '-srepet' if not set

    """
    fallback = '-srepet'
    return self._options.get('outputpattern', fallback)

get_input_pattern

Pattern this annotation expects for its input filename.

View Source

def get_input_pattern(self):
    """Return the pattern this annotation expects for its input filename.

    :returns: (str) The 'inputpattern' option, or '-palign' if not set

    """
    fallback = '-palign'
    return self._options.get('inputpattern', fallback)

Protected functions

__find_next_break

Return the index of the next interval representing a break.

It depends on the 'span' value.

Parameters

tier: (sppasTier)
start: (int) the position of the token where the search will start
span: (int) number of breaks to find; the index of the span-th break is returned

Returns

(int) index of the next interval corresponding to the span

View Source

@staticmethod
def __find_next_break(tier, start, span):
    """Return the index of the span-th break found from a position.

    A break is an interval whose serialized labels are equal to SIL_ORTHO.
    If fewer than 'span' breaks are found, the index of the last interval
    of the tier is returned.

    :param tier: (sppasTier)
    :param start: (int) the position of the token where the search will start
    :param span: (int) number of breaks to find
    :returns: (int) index of the next interval corresponding to the span

    """
    seen = 0
    for index in range(start, len(tier)):
        content = serialize_labels(tier[index].get_labels())
        if content != SIL_ORTHO:
            continue
        seen += 1
        if seen == span:
            return index

    # Not enough breaks: fall back on the last interval
    return len(tier) - 1

__fix_indexes

View Source

def __fix_indexes(self, tier, tok_start, shift):
    """Shift the start index and fix the related search and end indexes.

    Both breaks are searched from the interval following the new start.

    :param tier: (sppasTier)
    :param tok_start: (int) current index of the first token
    :param shift: (int) value to add to tok_start
    :returns: (tuple) the shifted start index, the index of the first
    break, the index of the break matching the 'span' option

    """
    new_start = tok_start + shift
    search_from = new_start + 1
    first_break = sppasSelfRepet.__find_next_break(
        tier, search_from, span=1)
    span_break = sppasSelfRepet.__find_next_break(
        tier, search_from, span=self._options['span'])
    return (new_start, first_break, span_break)

__add_repetition

Add a repetition - source and echos - in tiers.

Parameters

repetition: (DataRepetition)
spk_tier: (sppasTier) The tier of the speaker (to detect sources)
start_idx: (int) start index of the interval in spk_tier
trs_out: (sppasTranscription)

Returns

(bool) the repetition was added or not

View Source

@staticmethod
def __add_repetition(repetition, spk_tier, start_idx, trs_out):
    """Add a repetition - source and echos - in tiers.

    The source is added to 'SR-Source' and each echo to 'SR-Repet'. The
    labels, the number of labels and the type of the source are added to
    'SR-SrcStrain', 'SR-SrcLen' and 'SR-SrcType'. The type is:

        - 'split:N' if there are N > 1 echos;
        - 'reduction' if the source has more labels than the echo;
        - 'strict' if source and echo have the same number of labels and
          their best tags are pairwise equal;
        - 'variation' in any other case.

    :param repetition: (DataRepetition)
    :param spk_tier: (sppasTier) The tier of the speaker (to detect sources)
    :param start_idx: (int) start index of the interval in spk_tier
    :param trs_out: (sppasTranscription)
    :returns: (bool) the repetition was added or not

    """
    def collect_labels(begin, end):
        """Return copies of the labels of spk_tier found from begin to end."""
        labels = list()
        for ann in spk_tier.find(begin, end):
            for lbl in ann.get_labels():
                labels.append(lbl.copy())
        return labels

    src_tier = trs_out.find('SR-Source')
    echo_tier = trs_out.find('SR-Repet')
    sr_index = len(src_tier)
    # Sources and echos are numbered from 1
    rank = str(sr_index + 1)

    # Source indexes are relative to start_idx
    src_first, src_last = repetition.get_source()
    src_begin = spk_tier[start_idx + src_first].get_lowest_localization()
    src_end = spk_tier[start_idx + src_last].get_highest_localization()
    iitime = sppasInterval(src_begin.copy(), src_end.copy())

    try:
        src_ann = src_tier.create_annotation(
            sppasLocation(iitime), sppasLabel(sppasTag('S' + rank)))
        src_id = src_ann.get_meta('id')
    except Exception as e:
        # Best-effort: the source is skipped, but KeyboardInterrupt and
        # SystemExit are no longer swallowed and the reason is logged.
        logging.debug('Source S{:s} not added: {:s}'.format(rank, str(e)))
        return False

    # Echos: one annotation each, all linked to the source identifier
    echos = repetition.get_echos()
    echo_labels = list()
    for echo_first, echo_last in echos:
        rep_begin = spk_tier[start_idx + echo_first].get_lowest_localization()
        rep_end = spk_tier[start_idx + echo_last].get_highest_localization()
        eetime = sppasInterval(rep_begin.copy(), rep_end.copy())
        echo_labels.extend(collect_labels(rep_begin, rep_end))
        echo_ann = echo_tier.create_annotation(
            sppasLocation(eetime), sppasLabel(sppasTag('R' + rank)))
        echo_ann.set_meta('is_self_repetition_of', src_id)

    # Description of the source: labels and number of labels
    src_labels = collect_labels(src_begin, src_end)
    strain_ann = trs_out.find('SR-SrcStrain').create_annotation(
        sppasLocation(iitime), src_labels)
    strain_ann.set_meta('source_id', src_id)
    len_ann = trs_out.find('SR-SrcLen').create_annotation(
        sppasLocation(iitime), sppasLabel(sppasTag(len(src_labels), 'int')))
    len_ann.set_meta('source_id', src_id)

    # Type of the repetition
    or_type = 'variation'
    if len(echos) > 1:
        or_type = 'split:{:d}'.format(len(echos))
    elif len(src_labels) > len(echo_labels):
        or_type = 'reduction'
    elif len(src_labels) == len(echo_labels):
        differs = any(ls.get_best() != le.get_best()
                      for ls, le in zip(src_labels, echo_labels))
        if not differs:
            or_type = 'strict'
    type_ann = trs_out.find('SR-SrcType').create_annotation(
        sppasLocation(iitime), sppasLabel(sppasTag(or_type)))
    type_ann.set_meta('source_id', src_id)

    logging.info('OR {:d}. {} {} -> {:s}'.format(sr_index + 1, src_labels, echo_labels, or_type))
    return True