def ngram_alignments(self, ref, hyp):
"""n-gram alignment of ref and hyp.
The algorithm is based on the finding of matching n-grams, in the
range of a given gap. If 1-gram, keep only hypothesis items with a
high confidence score. A gap of search has to be fixed.
An interstice value ensure the gap between an item in the ref and
in the hyp won't be too far.
:param ref: (list of tokens) List of references
:param hyp: (list of tuples) List of hypothesis with their scores
The scores are supposed to range in [0;1] values.
:returns: List of alignments indexes as tuples (i_ref,i_hyp),
Example:
ref: w0 w1 w2 w3 w4 w5 w6 w7 w8 w9 w10 w11 w12
| | | | | | |
| | | \\ | | /
| | | \\ | | /
hyp: w0 w1 w2 wX w3 w5 w6 wX w9
Returned matches:
- if n=3: [ (0,0), (1,1), (2,2) ]
- if n=2: [(0, 0), (1, 1), (2, 2), (5, 5), (6, 6)]
- if n=1, it depends on the scores in hyp and the value of the gap.
"""
alignment = []
nman, nasr = self._create_ngrams(ref, hyp)
print(nman)
print(nasr)
lastidxa = len(nasr)
lastidxm = len(nman)
lastidx = min(lastidxa, lastidxm)
idxa = 0
idxm = 0
while idxa < lastidxa and idxm < lastidx + self._gap - 1:
found = False
if idxm < lastidxm and nasr[idxa] == nman[idxm]:
for i in range(self._ngram):
alignment.append((idxm + i, idxa + i))
found = True
if idxm < lastidxm:
for gap in range(self._gap):
if not found and idxm < lastidxm - gap - 1:
if nasr[idxa] == nman[idxm + gap + 1]:
idxm = idxm + gap + 1
for i in range(self._ngram):
alignment.append((idxm + i, idxa + i))
found = True
if idxm > 0:
for gap in range(self._gap):
if not found and idxm > gap + 1:
if nasr[idxa] == nman[idxm - gap - 1]:
idxm = idxm - gap - 1
for i in range(self._ngram):
alignment.append((idxm + i, idxa + i))
found = True
idxa += 1
idxm += 1
interstice = math.fabs(idxm - idxa)
if interstice > self._interstice:
vmax = max(idxa, idxm)
idxa = vmax
idxm = vmax
return sorted(list(set(alignment)))