# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.CuedSpeech.lpcvideo.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Tag a video with the Cued Speech keys.
.. _This file is part of SPPAS: <http://www.sppas.org/>
..
---------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
---------------------------------------------------------------------
"""
import logging
from sppas.src.config import sppasError
from sppas.src.anndata import sppasTier
from sppas.src.videodata import sppasVideoWriter
from sppas.src.imgdata import sppasCoords
from sppas.src.annotations.FaceSights import sppasSightsVideoReader
from sppas.src.annotations.FaceSights import Sights
from .videokeys import sppasKeysVideoBuffer
from .videokeys import sppasKeysVideoWriter
# ---------------------------------------------------------------------------
MSG_ERROR_MISMATCH = "The given {:d} coordinates in CSV file doesn't match " \
"the number of frames of the video {:d}"
# ---------------------------------------------------------------------------
[docs]class CuedSpeechVideoTagger(object):
"""Create a video with hands tagged on the face of a video.
"""
[docs] def __init__(self, video=None, csv_sights=None):
"""Create a new instance.
:param video: (str) Filename of the input video
:param csv_sights: (str) Filename of the CSV with sights
"""
# Vowel code rank
self.__vrank = "bcsmt"
self.__data = None
self.__video_buffer = sppasKeysVideoBuffer()
self.__video_writer = sppasKeysVideoWriter()
self.__video_writer.set_vowels_rank(self.__vrank)
self.__video_writer.set_options(csv=False, folder=False, tag=True, crop=False)
if video is not None and csv_sights is not None:
self.load(video, csv_sights)
# -----------------------------------------------------------------------
[docs] def vowel_rank(self, vowel_code):
"""Return an index from the code of a vowel or -1.
:param vowel_code: (char) One of b, c, s, m, t
"""
if vowel_code in self.__vrank:
return self.__vrank.index(vowel_code)
return -1
# -----------------------------------------------------------------------
[docs] def load(self, video, csv_sights):
"""Open the video and load the CVS data.
:param video: (str) Filename of the input video
:param csv_sights: (str) Filename of the CSV with sights
"""
self.close()
# Open the CSV file dans load all its data
self.__data = sppasSightsVideoReader(csv_sights)
# Only one kid is expected
for entry in self.__data.ids:
if len(entry) > 1:
raise ValueError("Only one identified face was expected. Get {:d}."
"".format(len(entry)))
# Open the video file
self.__video_buffer.open(video)
# The nb of lines in the CSV must correspond to the number of frames of the video
nframes = self.__video_buffer.get_nframes()
if len(self.__data.coords) != nframes:
# Release the video stream
self.__video_buffer.close()
self.__video_buffer.reset()
raise sppasError(MSG_ERROR_MISMATCH.format(len(self.__data.coords), nframes))
# Adjust the video writer
self.__video_writer.set_fps(self.__video_buffer.get_framerate())
# -----------------------------------------------------------------------
[docs] def close(self):
"""Release video streams."""
if self.__video_buffer is not None:
self.__video_buffer.close()
if self.__video_writer is not None:
self.__video_writer.close()
# -----------------------------------------------------------------------
def __del__(self):
self.close()
# -----------------------------------------------------------------------
[docs] def tag(self, syll_keys, output):
"""Tag the video with the given keys.
:param syll_keys: (sppasTier)
:param output: Output video filename
:return: list of created files -- expected 1
"""
if self.__video_buffer is None:
return ()
if self.__video_buffer.is_opened() is False:
return ()
result = list()
i = 0 # index of the first image of each buffer
nb = 0 # buffer number
read_next = True # reached the end of the video or not
self.__video_buffer.seek_buffer(0)
while read_next is True:
logging.info(" ... buffer number {:d}".format(nb + 1))
# Fill-in the buffer with images
read_next = self.__video_buffer.next()
# Use the 68 face sights to fix the positions of the 5 possible keys
self.__fix_vowels_position(self.__data.sights[i:i+len(self.__video_buffer)])
# Browse the tier to fix the key of each image of the buffer
image_duration = 1. / self.__video_buffer.get_framerate()
start_time = float(i) * image_duration
end_time = float(i + len(self.__video_buffer)) * image_duration
self.__fix_keys(start_time, end_time, syll_keys)
# Save the current result in a video
if output is not None:
new_files = self.__video_writer.write(self.__video_buffer, output, "")
result.extend(new_files)
nb += 1
i += len(self.__video_buffer)
return result
# -----------------------------------------------------------------------
def __fix_vowels_position(self, all_sights):
"""Fix the 5 vowels positions in each image of the buffer.
:param all_sights: (list of 68 sights)
"""
kid_sights = Sights(nb=68)
for buf_idx, sights in enumerate(all_sights):
# Get the sights of the kid in the image or use the previous ones
if len(sights) > 0:
kid_sights = sights[0]
# Among the 68, get the sights needed to fix the 5 positions of the keys
x0, y0, s0 = kid_sights.get_sight(0)
x2, y2, s2 = kid_sights.get_sight(2)
x8, y8, s8 = kid_sights.get_sight(8)
x36, y36, s36 = kid_sights.get_sight(36)
x48, y48, s48 = kid_sights.get_sight(48)
x57, y57, s57 = kid_sights.get_sight(57)
x60, y60, s60 = kid_sights.get_sight(60)
# Position 1 is close to the left eye (cheek bone)
x = x0 + ((x36 - x0) // 2)
y = y0 + (y0 - y36)
b = sppasCoords(x, y, confidence=s0)
# Position 2 is in the middle of the chin (chin)
x = x8
y = y8 - ((y8 - y57) // 4)
c = sppasCoords(x, y, confidence=s8)
# Position 3 is on the left side of the face (side)
x = max(0, x2 - (x36 - x0))
y = y2
s = sppasCoords(x, y, confidence=s2)
# Position 4 is at the left of the lips (mouth)
x = x48 - (x60 - x48)
y = y48 + ((y57 - y48) // 2)
m = sppasCoords(x, y, confidence=s48)
# Position 5 is at the glottis (throat)
x = x8
y = y8 + (y8-y57)
t = sppasCoords(x, y, confidence=s57)
# return coords in the "vowels rank" order
self.__video_buffer.set_coordinates(buf_idx, [b, c, s, m, t])
# -----------------------------------------------------------------------
def __fix_keys(self, start_time, end_time, syll_tier):
"""Fix the key of each image of the buffer.
:return: (list of tuples) List of vowel-consonant
"""
# Create a tier with only the annotations of the buffer to increase
# all the "find" needed later to browse the annotations
anns = syll_tier.find(start_time, end_time, overlaps=True)
tier = sppasTier("")
for a in anns:
tier.add(a)
image_duration = 1. / self.__video_buffer.get_framerate()
for buf_idx, image in enumerate(self.__video_buffer):
# Get the annotations during the image
s = start_time + (buf_idx * image_duration)
e = s + image_duration
anns = tier.find(s, e, overlaps=True)
if len(anns) == 0:
# There no key assigned to the image
self.__video_buffer.set_key(buf_idx, "0", "0")
elif len(anns) == 1:
# A key is matching the image time
labels = anns[0].get_labels()
if len(labels) == 2:
consonant = labels[0].get_best().get_content()
vowel = labels[1].get_best().get_content()
self.__video_buffer.set_key(buf_idx, consonant, vowel)
else:
raise ValueError(
"Two labels (consonant, vowel) were expected in "
"CuedSpeech. Got {:d} instead.".format(len(labels)))
else:
# There are several keys assigned to the image.
# It is probably a transition between 2 keys but both are
# too short in time to be drawn.
self.__video_buffer.set_key(buf_idx, "0", "0")