Source code for anndata.aio.xtrans
# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.anndata.aio.xtrans.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Input/Output of XTrans.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
XTrans is a multi-platform, multilingual, multi-channel transcription tool
that supports manual transcription and annotation of audio recordings.
The last version of XTrans was released in 2009.
https://www.ldc.upenn.edu/language-resources/tools/xtrans
"""
import codecs
from sppas.src.config import sg
from ..anndataexc import AioLineFormatError
from ..anndataexc import AnnDataTypeError
from ..ann.annlocation import sppasLocation
from ..ann.annlocation import sppasPoint
from ..ann.annlocation import sppasInterval
from .text import sppasBaseText
from .aioutils import load
from .aioutils import format_labels
# ----------------------------------------------------------------------------
class sppasTDF(sppasBaseText):
    """SPPAS TDF reader.

    This class implements a TDF reader, but not a writer.
    TDF is a Tab-Delimited Format. It contains 13 columns but SPPAS only
    extracts 8 of them.

    TDF does not support alternative labels nor locations. Only the ones
    with the best score are saved.
    TDF can save several tiers.
    TDF does not support controlled vocabularies.
    TDF does not support hierarchy.
    TDF does not support metadata.
    TDF supports media assignment.
    TDF supports intervals only.
    TDF does not support alternative tags.
    TDF does not support radius.

    """

    # Minimum number of tab-separated fields a data line must contain to be
    # considered a TDF line (13 are expected in a well-formed file).
    _MIN_COLUMNS = 10

    @staticmethod
    def detect(filename):
        """Check whether a file is of TDF format or not.

        The file is read with the SPPAS default encoding. Every line which
        is not a comment must contain at least 10 tab-separated fields.
        A file without any such line (e.g. an empty file) is accepted.

        :param filename: (str) Name of the file to check.
        :returns: (bool) False if the file can't be opened or decoded, or
            if a non-comment line has less than 10 fields. True otherwise.

        """
        # Open and load the content. The context manager closes the file,
        # so no explicit close() is needed.
        try:
            with codecs.open(filename, 'r', sg.__encoding__) as fp:
                lines = fp.readlines()
        except (IOError, UnicodeDecodeError):
            # can't open the file, or can't decode it with the SPPAS
            # default encoding
            return False

        # Check each line
        for line in lines:
            if sppasTDF.is_comment(line):
                continue
            if len(line.split('\t')) < sppasTDF._MIN_COLUMNS:
                return False

        return True

    # -----------------------------------------------------------------------

    @staticmethod
    def make_point(midpoint):
        """Create a point from a time value.

        The localization is a time value, so it is always converted to a
        float. The point is created with a fixed radius of 0.005.

        :param midpoint: (str, float, int) The time value.
        :returns: (sppasPoint)
        :raises: AnnDataTypeError if midpoint can't be converted to float.

        """
        try:
            midpoint = float(midpoint)
        except (ValueError, TypeError):
            # TypeError is raised by float(None): report it the same way
            # as a non-numeric string.
            raise AnnDataTypeError(midpoint, "float")

        # NOTE(review): the class docstring states that TDF does not
        # support radius, yet a radius is assigned here -- to be confirmed.
        return sppasPoint(midpoint, radius=0.005)

    # -----------------------------------------------------------------------

    def __init__(self, name=None):
        """Initialize a new sppasTDF instance.

        :param name: (str) This transcription name. The name of the class
            is used if None is given.

        """
        if name is None:
            name = self.__class__.__name__
        super(sppasTDF, self).__init__(name)

        self.default_extension = "tdf"
        self.software = "Xtrans"

        # override all
        self._accept_multi_tiers = True
        self._accept_no_tiers = True
        self._accept_metadata = False
        self._accept_ctrl_vocab = False
        # NOTE(review): the class docstring says "TDF supports media
        # assignment" but this flag is False -- confirm which one is right.
        self._accept_media = False
        self._accept_hierarchy = False
        # NOTE(review): the class docstring says "TDF supports intervals
        # only" but points are accepted here -- confirm which one is right.
        self._accept_point = True
        self._accept_interval = True
        self._accept_disjoint = False
        self._accept_alt_localization = False
        self._accept_alt_tag = False
        self._accept_radius = False
        self._accept_gaps = True
        self._accept_overlaps = True

    # -----------------------------------------------------------------------

    def read(self, filename):
        """Read a raw file and fill the sppasTranscription.

        It creates a tier for each speaker-channel observed in the file.
        Nothing is done if the file contains less than 2 lines, i.e. no
        more than the line with the names of the columns.

        :param filename: (str)

        """
        lines = load(filename)
        if len(lines) < 2:
            return

        # The first line is the name of the columns, the others are the
        # content.
        self._extract_lines(lines[0], lines[1:])

    # -----------------------------------------------------------------------

    def _extract_lines(self, first_line, lines):
        """Extract the content of the TDF file.

        :param first_line: (str) The first line of the TDF file (names of
            the columns)
        :param lines: (list of str) The content of the file, i.e. all the
            lines following the first one.
        :raises: AioLineFormatError if a required column is missing in the
            first line, or if a line does not contain enough fields.
        :raises: AnnDataTypeError if a start or end time is not a float.

        """
        # The 1st line indicates the names of the columns. Names are
        # stripped so that the end-of-line attached to the last one can't
        # prevent it from being found.
        column_names = [c.strip() for c in first_line.split('\t')]

        # Find indexes of the relevant information
        try:
            # index function raises a ValueError if the string is missing
            channel = column_names.index('channel;int')
            speaker = column_names.index('speaker;unicode')
            speaker_type = column_names.index('speakerType;unicode')
            speaker_dialect = column_names.index('speakerDialect;unicode')
            tag = column_names.index('transcript;unicode')
            begin = column_names.index('start;float')
            end = column_names.index('end;float')
            media_url = column_names.index('file;unicode')
        except ValueError:
            raise AioLineFormatError(1, first_line)

        # A line must be long enough to hold every column read below,
        # otherwise indexing it would raise an IndexError.
        required = (channel, speaker, speaker_type, speaker_dialect,
                    tag, begin, end, media_url)
        min_fields = max(sppasTDF._MIN_COLUMNS, max(required) + 1)

        # Extract rows, create tiers and metadata.
        for i, raw_line in enumerate(lines):
            stripped = raw_line.strip()

            # ignore blank lines
            if len(stripped) == 0:
                continue

            # a comment can contain metadata
            if sppasBaseText.is_comment(stripped):
                sppasBaseText._parse_comment(stripped, self)
                continue

            # a tab-delimited line. Only the end-of-line is removed before
            # splitting: stripping tabs would shift or drop empty columns.
            fields = raw_line.rstrip('\r\n').split('\t')
            if len(fields) < min_fields:
                # +2: the names of the columns are at line 1 of the file
                # and enumerate() starts at 0.
                raise AioLineFormatError(i + 2, stripped)

            # check for the tier (find it or create it)
            tier_name = fields[speaker] + '-' + fields[channel]
            tier = self.find(tier_name)
            if tier is None:
                # Create the media linked to the tier
                media = sppasBaseText.create_media(
                    fields[media_url].strip(), self)

                # Create the tier and set metadata
                tier = self.create_tier(tier_name, media=media)
                tier.set_meta("media_channel", fields[channel])
                tier.set_meta("speaker_name", fields[speaker])
                tier.set_meta("speaker_type", fields[speaker_type])
                tier.set_meta("speaker_dialect", fields[speaker_dialect])

            # Add the new annotation
            location = sppasLocation(sppasInterval(
                sppasTDF.make_point(fields[begin]),
                sppasTDF.make_point(fields[end])))
            labels = format_labels(fields[tag], separator="\n", empty="")
            tier.create_annotation(location, labels)