SPPAS Automatic Self-Repetition Detection.
Detect self-repetitions. The result has never been validated by an expert. This annotation is performed on the basis of time-aligned tokens or lemmas. The output is made of 2 tiers with sources and echos.
SPPAS Automatic Self-Repetition Detection.
Detect self-repetitions. The result has never been validated by an expert. This annotation is performed on the basis of time-aligned tokens or lemmas. The output is made of 2 tiers with sources and echos.
Create a new sppasRepetition instance.
def __init__(self, log=None):
"""Create a new sppasRepetition instance.
:param log: (sppasLog) Human-readable logs.
"""
super(sppasSelfRepet, self).__init__('selfrepet.json', log)
Self-Repetition detection.
def self_detection(self, tier):
"""Self-Repetition detection.
:param tier: (sppasTier)
"""
trs_output = sppasTranscription(self.name)
trs_output.create_tier('SR-Source')
trs_output.create_tier('SR-SrcStrain')
trs_output.create_tier('SR-SrcLen')
trs_output.create_tier('SR-SrcType')
trs_output.create_tier('SR-Repet')
stop_words = self._stop_words.copy()
stop_words.evaluate(tier, merge=True)
repetition = SelfRepetition(stop_words)
tok_start, tok_search, tok_end = self.__fix_indexes(tier, 0, 0)
while tok_start < tok_end:
tokens = [serialize_labels(tier[i].get_labels()) for i in range(tok_start, tok_end + 1)]
speaker = DataSpeaker(tokens)
limit = tok_search - tok_start
repetition.detect(speaker, limit)
shift = 1
if repetition.get_source() is not None:
sppasSelfRepet.__add_repetition(repetition, tier, tok_start, trs_output)
src_start, src_end = repetition.get_source()
shift = src_end + 1
tok_start, tok_search, tok_end = self.__fix_indexes(tier, tok_start, shift)
return trs_output
Run the automatic annotation process on an input.
def run(self, input_files, output=None):
"""Run the automatic annotation process on an input.
:param input_files: (list of str) Time-aligned tokens
:param output: (str) the output file name
:returns: (sppasTranscription)
"""
parser = sppasTrsRW(input_files[0])
trs_input = parser.read()
tier_tokens = sppasFindTier.aligned_tokens(trs_input)
tier_input = self.make_word_strain(tier_tokens)
trs_output = self.self_detection(tier_input)
trs_output.set_meta('self_repetition_result_of', input_files[0])
self.transfer_metadata(trs_input, trs_output)
if len(self._word_strain) > 0:
trs_output.append(tier_input)
if self._options['stopwords'] is True:
trs_output.append(self.make_stop_words(tier_input))
if output is not None:
if len(trs_output) > 0:
output_file = self.fix_out_file_ext(output)
parser = sppasTrsRW(output_file)
parser.write(trs_output)
return [output_file]
else:
raise EmptyOutputError
return trs_output
Pattern this annotation uses in an output filename.
def get_output_pattern(self):
"""Pattern this annotation uses in an output filename."""
return self._options.get('outputpattern', '-srepet')
Pattern this annotation expects for its input filename.
def get_input_pattern(self):
"""Pattern this annotation expects for its input filename."""
return self._options.get('inputpattern', '-palign')
Return the index of the next interval representing a break.
It depends on the 'span' value.
@staticmethod
def __find_next_break(tier, start, span):
"""Return the index of the next interval representing a break.
It depends on the 'span' value.
:param tier: (sppasTier)
:param start: (int) the position of the token where the search will start
:param span: (int)
:returns: (int) index of the next interval corresponding to the span
"""
nb_breaks = 0
for i in range(start, len(tier)):
if serialize_labels(tier[i].get_labels()) == SIL_ORTHO:
nb_breaks += 1
if nb_breaks == span:
return i
return len(tier) - 1
def __fix_indexes(self, tier, tok_start, shift):
tok_start += shift
tok_search = sppasSelfRepet.__find_next_break(tier, tok_start + 1, span=1)
tok_end = sppasSelfRepet.__find_next_break(tier, tok_start + 1, span=self._options['span'])
return (tok_start, tok_search, tok_end)
Add a repetition - source and echos - in tiers.
@staticmethod
def __add_repetition(repetition, spk_tier, start_idx, trs_out):
"""Add a repetition - source and echos - in tiers.
:param repetition: (DataRepetition)
:param spk_tier: (sppasTier) The tier of the speaker (to detect sources)
:param start_idx: (int) start index of the interval in spk_tier
:param trs_out: (sppasTranscription)
:returns: (bool) the repetition was added or not
"""
src_tier = trs_out.find('SR-Source')
echo_tier = trs_out.find('SR-Repet')
sr_index = len(src_tier)
s, e = repetition.get_source()
src_begin = spk_tier[start_idx + s].get_lowest_localization()
src_end = spk_tier[start_idx + e].get_highest_localization()
iitime = sppasInterval(src_begin.copy(), src_end.copy())
try:
a = src_tier.create_annotation(sppasLocation(iitime), sppasLabel(sppasTag('S' + str(sr_index + 1))))
src_id = a.get_meta('id')
except:
return False
echo_labels = list()
for s, e in repetition.get_echos():
rep_begin = spk_tier[start_idx + s].get_lowest_localization()
rep_end = spk_tier[start_idx + e].get_highest_localization()
eetime = sppasInterval(rep_begin.copy(), rep_end.copy())
anns = spk_tier.find(rep_begin, rep_end)
for a in anns:
for lbl in a.get_labels():
echo_labels.append(lbl.copy())
a = echo_tier.create_annotation(sppasLocation(eetime), sppasLabel(sppasTag('R' + str(sr_index + 1))))
a.set_meta('is_self_repetition_of', src_id)
anns = spk_tier.find(src_begin, src_end)
src_labels = list()
for a in anns:
for lbl in a.get_labels():
src_labels.append(lbl.copy())
a = trs_out.find('SR-SrcStrain').create_annotation(sppasLocation(iitime), src_labels)
a.set_meta('source_id', src_id)
a = trs_out.find('SR-SrcLen').create_annotation(sppasLocation(iitime), sppasLabel(sppasTag(len(src_labels), 'int')))
a.set_meta('source_id', src_id)
or_type = 'variation'
if len(repetition.get_echos()) > 1:
or_type = 'split:{:d}'.format(len(repetition.get_echos()))
elif len(src_labels) > len(echo_labels):
or_type = 'reduction'
elif len(src_labels) == len(echo_labels):
equals = True
for ls, le in zip(src_labels, echo_labels):
if ls.get_best() != le.get_best():
equals = False
break
if equals is True:
or_type = 'strict'
a = trs_out.find('SR-SrcType').create_annotation(sppasLocation(iitime), sppasLabel(sppasTag(or_type)))
a.set_meta('source_id', src_id)
logging.info('OR {:d}. {} {} -> {:s}'.format(sr_index + 1, src_labels, echo_labels, or_type))
return True