@InProceedings{bigi2024jep, author = {Brigitte Bigi and Núria Gala}, title = {Preuve de concept d'un système de génération automatique en Langue française Parlée Complétée}, booktitle= {XXXVe Journées d’Études sur la Parole (JEP)}, address= {Toulouse, France}, year = 2024, pages = 512-520, note = {https://inria.hal.science/hal-04623112}, abstract = {La Langue française Parlée Complétée (LfPC) est un système de communication développé pour les personnes sourdes afin de compléter la lecture labiale avec une main, au niveau phonétique. Il est utilisé par les enfants pour acquérir des compétences en lecture, en lecture labiale et en communication orale. L’objectif principal est de permettre aux enfants sourds de devenir des lecteurs et des locuteurs compétents en langue française. Nous proposons une preuve de concept (PoC) d’un système de réalité augmentée qui place automatiquement la représentation d’une main codeuse sur la vidéo pré-enregistrée d’un locuteur. Le PoC prédit la forme et la position de la main, le moment durant lequel elle doit être affichée, et ses coordonnées relativement au visage dans la vidéo. Des photos de mains sont ensuite juxtaposées à la vidéo. Des vidéos annotées automatiquement par le PoC ont été montrées à des personnes sourdes qui l’ont accueilli et évalué favorablement.} } @InProceedings{gala2024lrec, author = {Núria Gala and Brigitte Bigi and Marie Bauer}, title = {Automatically Estimating Textual and Phonemic Complexity for Cued Speech: How to See the Sounds from French Texts}, booktitle= {The 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING)}, address = {Turin, Italy}, year = 2024, pages = 1817-1824, note = {https://hal.science/hal-04580180}, abstract = {In this position paper we present a methodology to automatically annotate French text for Cued Speech (CS), a communication system developed for people with hearing loss to complement speech reading at the phonetic level. 
This visual communication mode uses handshapes in different placements near the face in combination with the mouth movements (called ‘cues’ or ‘keys’) to make the phonemes of spoken language look different from each other. CS is used to acquire skills in lip-reading, in oral communication and for reading. Despite many studies demonstrating its benefits, there are few resources available for learning and practicing it, especially in French. We thus propose a methodology to phonemize written corpora so that each word is aligned with the corresponding CS key(s). This methodology is proposed as part of a wider project aimed at creating an augmented reality system displaying a virtual coding hand where the user will be able to choose a text upon its complexity for cueing. } } @misc{bigi2024acs14, author = {Brigitte Bigi}, title = {Livrable WP1 - L4 : Détection des mains et du visage du locuteur}, address = {France}, year = 2024, url = {https://auto-cuedspeech.org/wp1l4.html}, note = {https://hal.science/hal-04518632}, abstract = {Ce livrable fait parti du projet "Voir les sons avec du Cued Speech Automatisé : la réalité augmentée au service des personnes sourdes" financé par la FIRAH. Il fait parti des résultats du Work Package 1 : "Collecte et annotation de corpus codés". Il décrit les annotations de la main codeuse et du visage. Pour chaque image de chaque vidéo, nous cherchons à déterminer les coordonnées de points spécifiques, à savoir 21 sur la main et 68 sur le visage. L'analyse de ces coordonnées permettra de modéliser la trajectoire suivie par la main et son inclinaison durant le codage, ainsi que la localisation des voyelles autour du visage. 
} } @InProceedings{pakrashi2023, title = {Automatic Syllabification of Bengali in SPPAS}, author = {Moumita Pakrashi and Brigitte Bigi and Shakuntala Mahanta}, booktitle = {26th Conference of the Oriental Conference on Speech Database and Assessments}, address = {Igdtuw, Delhi, India}, pages = {PID:157}, year = {2023}, note = {https://hal.science/hal-04349654}, abstract = {This paper describes the automatic syllabification process of Bengali speech in SPPAS software. The process of detecting syllable boundaries has been carried out using a rule-based system applied to the permissible syllable structures of Bengali. Syllable structures of Bengali has undergone a fair amount of research. In this paper, we verify the existing syllabification rules of Bengali through a speech corpus and then describe its implementation through the resource toolkit of SPPAS software. Finally, by evaluating the automated syllabification system with the manually aligned syllables of the corpus, we have successfully implemented this automatization task, which can significantly help to syllabify large scale speech and text databases in Bengali.} } @InBook{ghio2023dunod, chapter = {{Etudier le langage oral : les diff{\'e}rents types de donn{\'e}es recueillies}}, author = {Alain Ghio and Brigitte Bigi}, title = {{Introduction aux statistiques en sciences du langage}}, editor = {Clara Solier and Lucille Soulier and Nour Ezzedine}, publisher = {{Dunod}}, pages = {7-36}, year = {2023}, note = {https://hal.science/hal-04248428} } @inproceedings{welby23sigul, author={Pauline Welby and Brigitte Bigi and Antoine Corral and Fabrice Wacalie and Guillaume Wattelez}, title={{A Visit to the Cliffs of Jokin: A Role for Phonetizers in Second Language Pronunciation and Word Learning, with an Example from the Languages of New Caledonia }}, year=2023, booktitle={Proc. 
2nd Annual Meeting of the ELRA/ISCA SIG on Under-resourced Languages (SIGUL 2023)}, pages={19--23}, doi={10.21437/SIGUL.2023-5}, url={https://www.isca-speech.org/archive/pdfs/sigul_2023/welby23_sigul.pdf}, note={https://amu.hal.science/hal-04381129}, abstract={In this position paper, we argue for a role for overt phonetizers in second language learning. Phonetization or letter-to-sound conversion is often used simply as a module of text-to-speech synthesis (TTS) or to create pronunciations for dictionaries. Based on evidence of the overwhelming influence of orthographic input on second language pronunciation and word learning, we argue that on their own (or coupled with TTS), phonetizers can be effective support tools for two broad groups: 1. language learners and instructors, and 2. non-specialized users. We address the issues involved and give the example of a multilingual phonetizer under development in New Caledonia, a special status collectivity of France in the South Pacific. Encountering words and names in one of the almost 30 languages of the indigenous Kanak people of New Caledonia is an everyday experience, for example, on class lists, road signs and in news articles. Pronouncing these words is often a challenge, since each of the languages has its own phonology and its own orthography. 
We discuss the motivation behind the phonetizer, challenges in its development, and potential applications, many of which are common to other endangered or vulnerable and under-resourced languages.} } @InProceedings{bigi2023ltclfpc, title = {An analysis of produced versus predicted French Cued Speech keys}, author = {Brigitte Bigi}, booktitle = {10th Language & Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics, ISBN: 978-83-232-4176-8}, address = {Poznań, Poland}, pages = {24-28}, year = {2023}, note = {https://hal.science/hal-04081282}, abstract = {Cued Speech is a communication system developed for deaf people to complement speechreading at the phonetic level with hands. This visual communication mode uses handshapes in different placements near the face in combination with the mouth movements of speech to make the phonemes of spoken language look different from each other. This paper presents an analysis on produced cues in 5 topics of CLeLfPC, a large corpus of read speech in French with Cued Speech. A phonemes-to-cues automatic system is proposed in order to predict the cue to be produced while speaking. This system is part of SPPAS-the automatic annotation an analysis of speech, an open source software tool. The predicted keys of the automatic system are compared to the produced keys of cuers. The number of inserted, deleted and substituted keys are analyzed. We observed that most of the differences between predicted and produced keys comes from 3 common position's substitutions by some of the cuers. 
} } @InProceedings{bigi2023ltcben, title = {Resources Creation of Bengali for SPPAS}, author = {Moumita Pakrashi and Brigitte Bigi and Shakuntala Mahanta}, booktitle = {10th Language & Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics, ISBN: 978-83-232-4176-8}, address = {Poznań, Poland}, pages = {218-222}, year = {2023}, note = {https://hal.science/hal-04081305}, abstract = {The development of HLT tools inevitably involves the need for language resources. However, only a handful number of languages possess such resources for free. This paper presents the development of speech tools for the Bengali language. Particularly, this paper focuses on developing language resources of a tokenizer, an automatic speech system for predicting the pronunciation of the words and their segmentation in this low-resourced language. The newly created resources have been integrated into SPPAS software tool and distributed under the terms of public licenses.} } @Article{cappellini2023, author = {Marco Cappellini and Benjamin Holt and Brigitte Bigi and Marion Tellier and Christelle Zielinski}, title = {A multimodal corpus to study videoconference interactions for techno-pedagogical competence in second language acquisition and teacher education}, journal = {Corpus}, volume = {24}, year = {2023}, url = {http://journals.openedition.org/corpus/7440}, abstract = {This article aims to describe the construction and annotation of a multimodal and multilingual (French, English, and Mandarin Chinese) corpus for the study of second language acquisition and professional development in language teaching. The corpus was built within a research project whose main objective is to determine which techno-semio-pedagogical competencies are developed informally and which ones require formal training. In this paper, we explain the theoretical framework adopted, characterized by an ecological approach to the interactive environment. 
We subsequently illustrate the procedures for the collection of audio, video and eye-tracking data. We also provide details of the annotation of raw data to produce a corpus of analysis drawing on different automatic and semi-automatic annotation tools. Finally, we explain how such a corpus enables us to study the development of learners’ multimodal competence and tutors’ techno-pedagogical competence and point to some other possible research questions.} } @misc{gala2023, author = {Núria Gala and Brigitte Bigi}, title = {Création de ressources en langue française parlée complétée pour faciliter l'accès à la langue orale via l'écrit}, address = {Lyon, France}, booktitle = {Journée Scientifique de l'Institut des Sciences et Techniques de la Réadaptation, Institut des Sciences et Techniques de la Réadaptation}, year = 2023, url = {https://auto-cuedspeech.org/documents/presentation_JISTR23/}, note = {https://hal.science/hal-04200112}, abstract = {La Langue française Parlée Complétée (LfPC ou Cued Speech) est un codage des sons via des informations visuelles : chaque son est représenté avec une forme de main pour une consonne et une position autour du visage pour une voyelle. La LfPC est utilisée par le public sourd et malentendant pour acquérir de bonnes compétences en lecture, en lecture labiale et en expression orale. Il permet notamment aux enfants sourds ou malentendants de devenir des bons lecteurs, compétence indispensable pour tous les apprentissages. Malgré les nombreux travaux démontrant ses avantages, il n’existe que peu de ressources en LfPC. Aussi, ce projet vise à développer des ressources pour l’apprentissage et la pratique de la LfPC. Nous nous proposons de développer un système de réalité augmentée qui place automatiquement la représentation d’une main codeuse sur une vidéo d’un locuteur pré-enregistré. 
Un logiciel de codage automatique LfPC et des bibliothèques de vidéos codées à but pédagogique seront ainsi produits en respectant tous les critères de l’Open Science. Le système de codage sera développé sur la base d’observations d’un corpus de 4 heures d’enregistrement audio/vidéo (Bigi et al., 2022). Les supports textuels de la ressource seront issus du projet ANR ALECTOR, ce sont des textes simplifiés pour faciliter la lecture (Gala et al. 2020). } } @misc{bigi2023acs21, author = {Brigitte Bigi}, title = {Livrable WP2 - L1 : Système prédictif des séquences de clés (français)}, address = {France}, year = 2023, url = {https://auto-cuedspeech.org/wp2l1.html}, note = {https://hal.science/hal-04348920}, abstract = {Ce livrable fait parti du projet "Voir les sons avec du Cued Speech Automatisé : la réalité augmentée au service des personnes sourdes" financé par la FIRAH. Il fait parti des résultats du Work Package 2 : Des phonèmes aux clés (QUOI ?). Il décrit le système de prédiction automatique des clés LfPC à partir d'un fichier audio et de sa transcription, pour la langue française, qui est disponible à partir de la version 4.10 de SPPAS (https://sppas.org). } } @misc{bigi2023acs12, author = {Brigitte Bigi}, title = {Livrable WP1 - L2 : Clés LfPC}, address = {France}, year = 2023, url = {https://auto-cuedspeech.org/wp1l2.html}, note = {https://hal.science/hal-04348901}, abstract = {Ce livrable fait parti du projet "Voir les sons avec du Cued Speech Automatisé : la réalité augmentée au service des personnes sourdes" financé par la FIRAH. Il fait parti des résultats du Work Package 1 : "WP1 – Collecte et annotation de corpus codés". Il décrit l'annotation qui permet d'indiquer quelles clés LfPC ont été réalisées durant la lecture. Elle concerne 10 locuteurs du corpus CLeLfPC, version 7, déposée sous licence CC-By-NC-4.0 sur https://www.ortolang.fr. 
} } @misc{bigi2023acs11, author = {Brigitte Bigi}, title = {Livrable WP1 - L1 : Segmentation phonétique}, address = {France}, year = 2023, url = {https://auto-cuedspeech.org/wp1l1.html}, note = {https://hal.science/hal-04348909}, abstract = {Ce livrable fait parti du projet "Voir les sons avec du Cued Speech Automatisé : la réalité augmentée au service des personnes sourdes" financé par la FIRAH. Il fait parti des résultats du Work Package 1 : "WP1 – Collecte et annotation de corpus codés". Il décrit l'annotation qui permet d'indiquer quel phonème a été prononcé, et à quel moment. Elle concerne 10 locuteurs du corpus CLeLfPC, version 6, déposée sous licence CC-By-NC-4.0 sur https://www.ortolang.fr. } } @misc{priego2022tipa, title = {CHEESE! : Corpus « CHEESE! »}, author = {Béatrice Priego-Valverde and Brigitte Bigi and Mary Amoyal}, journal = {Travaux interdisciplinaires sur la parole et le langage}, year = {2022}, volume = {38}, publisher={OpenEdition}, url = {https://doi.org/10.4000/tipa.5525}, } @misc{bigi2022tipalfpc, title = {CLeLfPC}, author = {Brigitte Bigi and Maryvone Zimmerman}, journal = {Travaux interdisciplinaires sur la parole et le langage}, year = {2022}, volume = {38}, publisher={OpenEdition}, url = {https://doi.org/10.4000/tipa.5778}, note = {https://hal.science/hal-04030609} } @misc{bigi2022tipasppas, title = {Fiche technique SPPAS}, author = {Brigitte Bigi}, journal = {Travaux interdisciplinaires sur la parole et le langage}, year = {2022}, volume = {38}, publisher={OpenEdition}, url = {https://doi.org/10.4000/tipa.5745}, note = {https://hal.science/hal-04004616} } @Article{bigi2022tipa, title = {Principes et outils pour l’annotation des corpus}, author = {Mary Amoyal and Roxane Bertrand and Brigitte Bigi and Auriane Boudin and Christine Meunier and Berthille Pallaud and Béatrice Priego-Valverde and Stéphane Rauzy and Marion Tellier}, journal = {Travaux interdisciplinaires sur la parole et le langage}, pages = {186-234}, year = {2022}, volume = 
{38}, publisher={OpenEdition}, url = {https://doi.org/10.4000/tipa.5424}, note = {https://hal.science/hal-03917814}, } @InProceedings{bigi2022lrec, author = {Brigitte Bigi and Maryvonne Zimmermann and Carine André}, title = {CLeLfPC: a Large Open Multi-Speaker Corpus of French Cued Speech}, booktitle = {Proceedings of The 13th Language Resources and Evaluation Conference}, year = {2022}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {987-994}, url = {http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.104.pdf}, note = {https://hal.archives-ouvertes.fr/hal-03794830}, abstract = {Cued Speech is a communication system developed for deaf people to complement speechreading at the phonetic level with hands. This visual communication mode uses handshapes in different placements near the face in combination with the mouth movements of speech to make the phonemes of spoken language look different from each other. This paper describes CLeLfPC - Corpus de Lecture en Langue française Parlée Complétée, a corpus of French Cued Speech. It consists in about 4 hours of audio and HD video recordings of 23 participants. The recordings are 160 different isolated ‘CV’ syllables repeated 5 times, 320 words or phrases repeated 2-3 times and about 350 sentences repeated 2-3 times. The corpus is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License. It can be used for any further research or teaching purpose. The corpus includes orthographic transliteration and other phonetic annotations on 5 of the recorded topics, i.e. syllables, words, isolated sentences and a text. 
The early results are encouraging: it seems that 1/ the hand position has a high influence on the key audio duration; and 2/ the hand shape has not.} } @InProceedings{bigi2022sp, author = {Massimo Pettorino and Marta Maffia and Brigitte Bigi}, title = {{A longitudinal study on Italian speech rhythm in Parkinson’s Disease}}, booktitle = {Proceedings of Speech Prosody}, year = {2022}, address = {Lisbon, Portugal}, pages = {52-56}, url = {http://hdl.handle.net/11574/208038}, note = {https://hal.archives-ouvertes.fr/hal-03823951}, abstract = {Parkinson’s Disease dysarthria affects the speech motor control, causing alterations at the suprasegmental level of speech. In previous researches, vowel percentage (%V) and the mean interval between two consecutive vowel onset points (VtoV) were effectively used in the synchronic description of the rhythmic variations of Italian PD speech, compared to healthy speech, even at a very early stage of the disease. This study aims at verifying the early alteration of PD speech rhythm using a diachronic approach. To reach this goal, a corpus of read speech produced by a single PD subject (female, 66 years old) has been collected, consisting of 15 radiophonic speech samples (about 100 s each) on the same topic, recorded between 2001 and 2021. The speech samples were manually segmented in consonantal and vocalic intervals by means of Praat, allowing the calculation of %V and VtoV. The results show an alteration of %V values since 2018, two years before the diagnosis and the insurgence of motor symptoms. Moreover, first results of the application of the automatic segmentation performed by SPPAS on a selection of PD speech samples will also be presented.} } @Article{bigi2022lnai, title = {The automatic search for sounding segments of SPPAS: application to Cheese! corpus}, author = {Brigitte Bigi and Béatrice Priego-Valverde}, journal = {Human Language Technology. 
Challenges for Computer Science and Linguistics, LNAI}, pages = {16-27}, year = {2022}, volume = {LNCS 13212}, ISBN = {978-3-031-05327-6}, publisher={Springer International Publishing}, url = {https://dx.doi.org/10.1007/978-3-031-05328-3_2}, note = {https://hal.archives-ouvertes.fr/hal-03697808}, abstract = {The development of corpora inevitably involves the need for segmentation. For most of the corpora, the first segmentation to operate consist in determining silences vs Inter-Pausal Units - IPUs, i.e. sounding segments. This paper presents the "Search for IPUs" feature included in SPPAS - the automatic annotation and analysis of speech software tool distributed under the terms of public licenses. Particularly, this paper is focusing on its evaluation on Cheese! corpus, a corpus of reading then conversational speech between two participants. The paper reports the number of manual actions which was performed manually by the annotators in order to obtain the expected segmentation: add new IPUs, ignore irrelevant ones, split an IPU, merge two consecutive ones and move boundaries. The evaluation shows that the proposed fully automatic method is relevant.} } @misc{bigi2021clelfpc, author = {Brigitte Bigi and Maryvonne Zimmermann}, title = {{CLeLfPC : Corpus de Lecture en Langue française Parlée Complétée}}, url = {https://hdl.handle.net/11403/clelfpc}, year = {2021}, address = {http://sppas.org/LFPC/}, abstract = {CLeLfPC - Corpus de Lecture en Langue française Parlée Complétée, is a corpus of read French cued speech. Every year, the ALPC organizes training programmes designed to improve the qualifications of participants in coding LfPC. The corpus was then recorded during the summer 2021 session, in August 24-26 at “Les Karellis” (Savoie, France). See https://alpc.asso.fr/stage-2021-quarantiemes-rugissants/ for details about the event. All 23 recorded participants have voluntereed. There were 25-59 years old - average is 40; there are 5 men and 18 women. 
We have prepared a set of 10 topics. We asked participants to read aloud and to cue one topic; two participants accepted to read 2 different topics. Each topic was made of 4 sessions. The sessions were recorded separately for the participant to have a short break: 1. 32 isolated “CV” syllables; 2. 32 isolated words or phrases; 3. isolated sentences; 4. a text divided into 4-7 parts. The topics/sessions were carefully designed in order to cover a large amount of different keys and sequences of keys. We then recorded 25 sessions, and collected 4h of audio/video recordings.} } @Article{meunier2021sl, title = {Propriétés phonétiques et lexicales dans les variations temporelles des phonèmes en parole conversationnelle}, author = {Christine Meunier and Brigitte Bigi}, journal = {Studii de lingvistică}, year = {2021}, pages = {11--38}, volume = {11}, abstract = {Cette étude vise à clarifier certaines propriétés phonétiques et lexicales entraînant de fortes variations temporelles en parole spontanée. Trois cohortes de durée (SHORT, STANDARD et LONG) sont extraites d’un corpus de parole conversationnelle en français. Si les liquides et les glissantes sont massivement représentées dans SHORT, on observe une forte distorsion temporelle des voyelles tendant vers l’allongement. A l’inverse, les fricatives et occlusives représentent des supports temporels stables. Certains mots fonction tendent à favoriser la réduction alors que l’allongement des phonèmes est dû, en grande partie, à leur présence dans des items fréquents et spécifiques à la langue orale (marqueurs discursifs, interjection). La durée de certains phonèmes (//, //, //, //) est fortement conditionnée par leurs supports lexicaux ou communicationnels. 
Les durées phonémiques sont donc déterminées, entre autres, par la nature des phonèmes et par les propriétés du lexique et de son usage en conversation.}, note = {https://hal.archives-ouvertes.fr/hal-03509380/} } @Article{bigi2021lnai, title = {Resources for Automated Speech Segmentation of the African Language Naija (Nigerian Pidgin)}, author = {Brigitte Bigi and Abiola S. Oyelere and Bernard Caron}, journal = {Human Language Technology. Challenges for Computer Science and Linguistics, LNAI 12598}, pages={164--173}, year={2021}, volume = {}, ISBN = {978-3-030-66527-2}, publisher={Springer International Publishing}, abstract = {The development of HLT tools inevitably involves the need for language resources. However, only a handful number of languages possesses such resources. This paper presents the development of HLT tools for the African language Naija (Nigerian Pidgin), spoken in Nigeria. Particularly, this paper is focusing on developing language resources for a tokenizer, an automatic speech system for predicting the pronunciation of the words and their segmentation. The newly created resources are integrated into SPPAS software tool and distributed under the terms of public licenses.}, url = {https://link.springer.com/chapter/10.1007/978-3-030-66527-2_12}, note = {https://halshs.archives-ouvertes.fr/halshs-03097325} } @InProceedings{lancien2020lrec, author = {Mélanie Lancien and Marie-Hélène Côté and Brigitte Bigi}, title = {Developing Resources for Automated Speech Processing of Quebec French}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {5323--5328}, abstract = {The analysis of the structure of speech nearly always rests on the alignment of the speech recording with a phonetic transcription. Nowadays several tools can perform this speech segmentation automatically. 
However, none of them allows the automatic segmentation of Quebec French (QF hereafter), the acoustics and phonotactics of QF differing widely from that of France French (FF hereafter). To adequately segment QF, features like diphthongization of long vowels and affrication of coronal stops have to be taken into account. Thus acoustic models for automatic segmentation must be trained on speech samples exhibiting those phenomena. Dictionaries and lexicons must also be adapted and integrate differences in lexical units and in the phonology of QF. This paper presents the development of linguistic resources to be included into SPPAS software tool in order to get Text normalization, Phonetization, Alignment and Syllabification. We adapted the existing French lexicon and developed a QF-specific pronunciation dictionary. We then created an acoustic model from the existing ones and adapted it with 5 minutes of manually time-aligned data. These new resources are all freely distributed with SPPAS version 2.7; they perform the full process of speech segmentation in Quebec French.}, url = {https://www.aclweb.org/anthology/2020.lrec-1.655}, note = {https://hal.archives-ouvertes.fr/hal-03042864} } @InProceedings{rauchbauer2020lrec, author = {Birgit Rauchbauer and Youssef Hmamouche and Brigitte Bigi and Laurent Prévot and Magalie Ochs and Thierry Chaminade}, title = {Multimodal Corpus of Bidirectional Conversation of Human-human and Human-robot Interaction during fMRI Scanning}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {668--675}, abstract = {In this paper we present investigation of real-life, bi-directional conversations. We introduce the multimodal corpus derived from these natural conversations alternating between human-human and human-robot interactions. 
The human-robot interactions were used as a control condition for the social nature of the human-human conversations. The experimental set up consisted of conversations between the participant in a functional magnetic resonance imaging (fMRI) scanner and a human confederate or conversational robot outside the scanner room, connected via bidirectional audio and unidirectional videoconferencing (from the outside to inside the scanner). A cover story provided a framework for natural, real-life conversations about images of an advertisement campaign. During the conversations we collected a multimodal corpus for a comprehensive characterization of bi-directional conversations. In this paper we introduce this multimodal corpus which includes neural data from functional magnetic resonance imaging (fMRI), physiological data (blood flow pulse and respiration), transcribed conversational data, as well as face and eye-tracking recordings. Thus, we present a unique corpus to study human conversations including neural, physiological and behavioral data.}, url = {https://www.aclweb.org/anthology/2020.lrec-1.84} } @InProceedings{priego2020lrec, author = {Béatrice Priego-Valverde and Brigitte Bigi and Mary Amoyal}, title = {"Cheese!": a Corpus of Face-to-face French Interactions. A Case Study for Analyzing Smiling and Conversational Humor}, booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference}, year = {2020}, address = {Marseille, France}, publisher = {European Language Resources Association}, pages = {467--475}, abstract = {Cheese! is a conversational corpus. It consists of 11 French face-to-face conversations lasting around 15 minutes each. Cheese! is a duplication of an American corpus (ref) in order to conduct a cross-cultural comparison of participants’ smiling behavior in humorous and non-humorous sequences in American English and French conversations. 
In this article, the methodology used to collect and enrich the corpus is presented: experimental protocol, technical choices, transcription, semi-automatic annotations, manual annotations of smiling and humor. An exploratory study investigating the links between smile and humor is then proposed. Based on the analysis of two interactions, two questions are asked: (1) Does smile frame humor? (2) Does smile has an impact on its success or failure? If the experimental design of Cheese! has been elaborated to study specifically smiles and humor in conversations, the high quality of the dataset obtained, and the methodology used are also replicable and can be applied to analyze many other conversational activities and other multimodal modalities.}, url = {https://www.aclweb.org/anthology/2020.lrec-1.59}, note = {https://hal.archives-ouvertes.fr/hal-02565645} } @InProceedings{meunier2020jep, author = {Christine Meunier and Morgane Peirolo and Brigitte Bigi}, title = {La mobilisation du tractus vocal est-elle variable selon les langues en parole spontan{\'e}e ?}, booktitle = {Journ{\'e}es d'{\'E}tudes sur la Parole, 33e {\'e}dition)}, address = {Nancy, France}, EDITOR = {Benzitoun, Christophe and Braud, Chlo{\'e} and Huber, Laurine and Langlois, David and Ouni, Slim and Pogodalla, Sylvain and Schneider, St{\'e}phane}, PUBLISHER = {ATALA}, pages = {433-441}, year = {2020}, abstract = {L'objectif de ce travail est de quantifier les positions articulatoires théoriques lors de la production de la parole spontanée dans trois langues. Chaque langue dispose d’un inventaire phonologique spécifique. Mais ces spécificités ne sont pas représentées telles quelles en parole spontanée dans laquelle les phonèmes n’ont pas tous la même fréquence d’apparition. Nous avons comparé trois langues (polonais, français et anglais américain) présentant des différences notables dans leur inventaire phonologique. 
Des positions articulatoires ont été calculées sur la base des fréquences des phonèmes dans chacune des trois langues dans des corpus de parole spontanée. Etonnamment, les résultats tendent à montrer que les positions articulatoires majoritaires sont très similaires dans les trois langues. Il semble ainsi que l’usage de la parole spontanée, et donc la distribution des phonèmes dans les langues, gomme les disparités des systèmes phonologiques pour tendre vers une mobilisation articulatoire commune. Des investigations plus approfondies devront vérifier cette observation. }, note = {https://hal.archives-ouvertes.fr/hal-02798569v3/} } @misc{bigi2019lt4all, author = {Brigitte Bigi}, title = {Automated Speech Segmentation: Example of an African Language}, url = {http://sppas.org/bigi/Doc/bigi2019lt4all.pdf}, year = {2019}, address = {https://lt4all.org/en/, UNESCO Headquarters, Paris}, howpublished = {Poster}, } @InProceedings{bigi2019ltc, author = {Brigitte Bigi and Béatrice {Priego-Valverde}}, title = {Search for Inter-Pausal Units: application to Cheese! corpus}, booktitle = {9th Language & Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics}, address = {Poznań, Poland}, year = {2019}, pages = {289-293}, isbn = {978-83-65988-30-0}, abstract = {The development of corpora inevitably involves the need for segmentation. For most of the corpora, the first segmentation to operate consist in determining silences vs Inter-Pausal Units - i.e. sounding segments. This paper presents the "Search for IPUs" feature included in SPPAS - the automatic annotation and analysis of speech software tool distributed under the terms of public licenses. Particularly, this paper is focusing on the use and the evaluation of this feature on Cheese! corpus, a corpus of read then conversational speech between two participants. 
The paper reports the number of manual actions that are required for a user to check the automatic annotation: add new IPUs, ignore un-relevant ones, move boundaries, etc. Such evaluation validates the proposed method.},
  url = {http://sppas.org/bigi/Doc/bigi2019ltcpwt.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-02428485}
}

@InProceedings{bigi2019ltcdemo1,
  author = {Brigitte Bigi},
  title = {Filtering multi-levels annotated data},
  booktitle = {9th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics},
  address = {Poznań, Poland},
  year = {2019},
  pages = {13--14},
  isbn = {978-83-65988-30-0},
  abstract = {More and more annotated corpora are now available, and so are tools to annotate automatically and/or manually. As large multimodal-multiparty corpora become prevalent, new annotation and analysis requirements are emerging. Multimodal annotations are commonly organized in {\it tiers} each of which is a collection of annotations, each of which is commonly made of an anchor in time and a label. In this demo, we present DataFilter feature. It allows to define a set of filters to create new tiers with only the annotations matching the given filters.
The system proposed in this demo is implemented as part of SPPAS software tool (Bigi, 2015), distributed under the terms of the GNU Public License.},
  url = {http://sppas.org/bigi/Doc/bigi2019ltcdemo1.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-02428491}
}

@InProceedings{bigi2019ltcdemo2,
  author = {Katarzyna Klessa and Maciej Karpiński and Brigitte Bigi},
  title = {Annotation and annotation mining tools for analyzing speech prosody in the Polish-German Borderland database},
  booktitle = {9th Language \& Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics},
  address = {Poznań, Poland},
  year = {2019},
  pages = {15--16},
  abstract = {In this demo presentation, we discuss tools and techniques developed within Borderland: a project dedicated to the analysis of paralinguistic phenomena in the conversations of Polish and German teenagers, entitled: Language of Boundaries and Boundaries of Language (see more at: http://borderland.amu.edu.pl/).},
  url = {http://sppas.org/bigi/Doc/bigi2019ltcdemo2.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-02428496}
}

@Article{priego2018,
  author = {Béatrice {Priego-Valverde} and Brigitte Bigi and Salvatore Attardo and Lucy Pickering and Elisa Gironzetti},
  title = {Is smiling during humor so obvious? A cross-cultural comparison of smiling behavior in humorous sequences in American English and French interactions},
  journal = {Intercultural Pragmatics},
  volume = {15},
  number = {4},
  pages = {563--591},
  year = {2018},
  doi = {10.1515/ip-2018-0020},
  abstract = {The present article is part of a larger cross-cultural research project on speaker-hearer smiling behavior in humorous and non-humorous conversations in American English and French. The American corpus consists of eight computer-mediated interactions between English native speakers, and the French one consists of four face-to-face interactions between French native speakers.
The goal of the study is twofold: first, we analyze the link between smiling and humor, focusing on the degree of synchronicity of smiling and the intensity of smiling during humorous and non-humorous segments; second, we investigate the various targets mobilized in conversational humor. The results obtained comparing the two data-sets show a correlation between the presence of humor, an increased smiling intensity, and an increase in the synchronized smiling behaviors displayed by participants. However, the two corpora also differ in terms of the displayed smiling behaviors: French participants display more non-synchronic smiling when humor is absent and more synchronic smiling when humor is present. Regarding the various targets of humor (Speaker, Recipient, Other person, Situation, Speaker+Recipient), while their distribution is different – it is more evenly distributed in the French data – the way in which these are mobilized in order to become humorous is quite similar.},
  note = {https://hal.archives-ouvertes.fr/hal-01923442}
}

@Article{bigi2018,
  author = {Brigitte Bigi and Christine Meunier},
  title = {Automatic segmentation of spontaneous speech},
  journal = {Revista de Estudos da Linguagem. International Thematic Issue: Speech Segmentation},
  editor = {Tommaso Raso and Heliana Mello and Plinio Barbosa},
  volume = {26},
  number = {4},
  issn = {2237-2083},
  year = {2018},
  abstract = {Most of the time, analyzing the phonetic entities of speech requires the alignment of the speech recording with its phonetic transcription. However, studies on automatic segmentation have predominantly been carried out on read speech or on prepared speech while spontaneous speech refers to a more informal activity, without any preparation. As a consequence, in spontaneous speech numerous phenomena occur such as hesitations, repetitions, feedback, backchannels, non-standard elisions, reduction phenomena, truncated words, and more generally, non-standard pronunciations.
Events like laughter, noises and filled pauses are also very frequent in spontaneous speech. This paper aims to compare read speech and spontaneous speech in order to evaluate the impact of speech style on a speech segmentation task. This paper describes the solution implemented into the SPPAS software tool to automatically perform speech segmentation of read and spontaneous speech. This solution consists mainly in two sorts of things: supporting an Enriched Orthographic Transcription for an optimization of the grapheme-to-phoneme conversion and allowing the forced-alignment of the following events: filled pauses, laughter and noises. Actually, these events represent less than 1 % of the tokens in read speech and about 6 % in spontaneous speech. They occur in a maximum of 3 % of the Inter-Pausal Units of a read speech corpus and from 20 % up to 36 % of the Inter-Pausal Units in the spontaneous speech corpora. The UBPA measure – Unit Boundary Positioning Accuracy, of the proposed forced-alignment system is 96.09 % accurate as regards read speech and 96.48 % for spontaneous speech with a delta range of 40 ms.},
  url = {http://www.periodicos.letras.ufmg.br/index.php/relin/article/view/13026},
  note = {https://hal.archives-ouvertes.fr/hal-01908434}
}

@InProceedings{bigi-jep2018,
  author = {Brigitte Bigi and Christine Meunier},
  title = {euh, rire et bruits en parole spontanée : application à l'alignement forcé},
  address = {Aix-en-Provence, France},
  booktitle = {XXXIIe Journées d’Études sur la Parole},
  pages = {648--656},
  doi = {10.21437/JEP.2018-74},
  url = {http://dx.doi.org/10.21437/JEP.2018-74},
  note = {https://hal.archives-ouvertes.fr/hal-01959445},
  year = 2018,
}

@Article{bigi2018lincei,
  title = {Annotation representation and file conversion tool},
  author = {Brigitte Bigi},
  journal = {Contributi del Centro Linceo Interdisciplinare ‘Beniamino Segre’ (ISSN 0394-0705)},
  url = {http://sppas.org/bigi/Doc/bigi2017lincei.pdf},
  year = {2018},
  volume = {137},
  isbn =
{978-88-218-1165-4, ISSN: 0394-0705},
  pages = {99--116},
  abstract = {Annotating corpora is of crucial importance in Corpus Linguistics. Linguistics annotation, especially when dealing with multiple domains, makes use of different tools within a given project. More and more annotated corpora are now available, and so are tools to annotate automatically and/or manually. Due to the diversity of linguistic phenomena, annotation tools lead to a variety of models, theories and formalisms. This diversity results in heterogeneous description formats, each tool developing its own framework. Then, none of the annotation tools are directly interoperable, each one using a native format, some of them on top of XML, some others developing an ad hoc markup language. The mapping between user formats is then an important issue. This paper presents an efficient annotation representation framework and the related tool to convert from/to some of the existing annotation file formats of various annotation software tools for audio recordings.},
  note = {https://hal.archives-ouvertes.fr/hal-01908449}
}

@InProceedings{bigi2017ltc,
  title = {Developing Resources for Automated Speech Processing of the African Language Naija (Nigerian Pidgin)},
  author = {Brigitte Bigi and Bernard Caron and Abiola S. Oyelere},
  booktitle = {8th Language and Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics},
  address = {Poznań, Poland},
  pages = {441--445},
  year = {2017},
  abstract = {The development of HLT tools inevitably involves the need for language resources. However, only a handful number of languages possesses such resources. This paper presents the development of HLT tools for the African language Naija (Nigerian Pidgin), spoken in Nigeria. Particularly, this paper is focusing on developing language resources for a tokenizer, an automatic speech system for predicting the pronunciation of the words and their segmentation.
The newly created resources are integrated into SPPAS software tool and distributed under the terms of public licenses.},
  url = {http://sppas.org/bigi/Doc/bigi2017ltc.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-01705707/document}
}

@misc{bigi2016barcelona,
  author = {Brigitte Bigi},
  title = {Introduction to SPPAS},
  year = {2016},
  address = {Departament de Filologia Catalana i Lingüística General, Universitat de Barcelona, Catalunya},
  howpublished = {Seminar}
}

@misc{bigi2016corli,
  author = {Brigitte Bigi},
  title = {Annotation automatique et analyse de corpus avec SPPAS},
  url = {http://sppas.org/tutorial.html},
  year = {2016},
  address = {CORLI - Corpus, Langues et Interactions, Lyon, France},
  howpublished = {Seminar}
}

@misc{bigi2016lpl,
  author = {Brigitte Bigi},
  title = {Multi-Lingual approaches to the automatic annotation of speech},
  url = {http://sppas.org/bigi/Doc/2016-LPL-PAC/},
  year = {2016},
  address = {The Phonology of Contemporary English, LPL, Aix-en-Provence, France},
  howpublished = {Seminar}
}

@article{bigi2016lnai,
  author = "Brigitte Bigi",
  title = {A phonetization approach for the forced-alignment task in {SPPAS}},
  journal = {Human Language Technology.
Challenges for Computer Science and Linguistics, LNAI 9561},
  pages = {515--526},
  year = {2016},
  isbn = {978-3-319-43807-8},
  publisher = {Springer Berlin Heidelberg},
  url = {http://link.springer.com/chapter/10.1007/978-3-319-43808-5_30},
  note = {https://hal.archives-ouvertes.fr/hal-01455223}
}

@misc{priego2016,
  author = "Béatrice {Priego-Valverde} and Brigitte Bigi",
  title = {Smiling behavior in humorous and non humorous conversations: a preliminary cross-cultural comparison between American English and French},
  howpublished = {Oral presentation},
  booktitle = {International Society for Humor Studies Conference},
  address = {Dublin, Ireland},
  year = {2016},
  note = {https://hal.archives-ouvertes.fr/hal-01455222}
}

@InProceedings{bigi2016lrec,
  author = {Brigitte Bigi and Roxane Bertrand},
  title = {Laughter in French Spontaneous Conversational Dialogs},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
  year = {2016},
  location = {Portorož, Slovenia},
  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  address = {Paris, France},
  isbn = {978-2-9517408-9-1},
  pages = {2168--2174},
  abstract = {This paper presents a quantitative description of laughter in height 1-hour French spontaneous conversations. The paper includes the raw figures for laughter as well as more details concerning inter-individual variability. It firstly describes to what extent the amount of laughter and their durations varies from speaker to speaker in all dialogs. In a second suite of analyses, this paper compares our corpus with previous analyzed corpora. In a final set of experiments, it presents some facts about overlapping laughs.
This paper have quantified these all effects in free-style conversations, for the first time.},
  url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/67_Paper.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-01462176/}
}

@InProceedings{meunier2016lrec,
  author = {Christine Meunier and Cécile Fougeron and Corinne Fredouille and Brigitte Bigi and Lise Crevier-Buchman and Elisabeth Delais-Roussarie and Laurianne Georgeton and Alain Ghio and Imed Laaridh and Thierry Legou and Claire Pillot-Loiseau and Gilles Pouchoulin},
  title = {The TYPALOC Corpus: A Collection of Various Dysarthric Speech Recordings in Read and Spontaneous Styles},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
  year = {2016},
  location = {Portorož, Slovenia},
  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  address = {Paris, France},
  isbn = {978-2-9517408-9-1},
  pages = {4658--4665},
  abstract = {This paper presents the TYPALOC corpus of French Dysarthric and Healthy speech and the rationale underlying its constitution. The objective is to compare phonetic variation in the speech of dysarthric vs. healthy speakers in different speech conditions (read and unprepared speech). More precisely, we aim to compare the extent, types and location of phonetic variation within these different populations and speech conditions. The TYPALOC corpus is constituted of a selection of 28 dysarthric patients (three different pathologies) and of 12 healthy control speakers recorded while reading the same text and in a more natural continuous speech condition. Each audio signal has been segmented into Inter-Pausal Units. Then, the corpus has been manually transcribed and automatically aligned.
The alignment has been corrected by an expert phonetician. Moreover, the corpus benefits from an automatic syllabification and an Automatic Detection of Acoustic Phone-Based Anomalies. Finally, in order to interpret phonetic variations due to pathologies, a perceptual evaluation of each patient has been conducted. Quantitative data are provided at the end of the paper.},
  url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/555_Paper.pdf},
  note = {https://halshs.archives-ouvertes.fr/halshs-01401377}
}

@InProceedings{meunier2016jep,
  author = {Christine Meunier and Brigitte Bigi},
  title = {Répartition des phonèmes réduits en parole conversationnelle. Approche quantitative par extraction automatique},
  booktitle = {Journées d'Etudes sur la Parole (JEP)},
  address = {Paris, France},
  year = {2016},
  abstract = {Cette étude vise à mieux comprendre la répartition des réductions phonétiques présentes dans la production de parole. Nous avons sélectionné l'ensemble des phonèmes les plus courts (30ms) à partir de l'alignement d'un corpus de parole conversationnelle. Cette version contenant uniquement les phonèmes courts (V1) est comparée à la version contenant l'alignement de tous les phonèmes du corpus (V0). Les deux versions sont mises en relation avec l'annotation des mots et de leur catégorie syntaxique. Les résultats montrent que les liquides, les glissantes et les voyelles fermées sont plus représentées dans V1 que dans V0. Par ailleurs, la nature et la catégorie syntaxique des mots modulent la distribution des phonèmes en V1. Ainsi, la nature instable du /l/, ainsi que sa présence dans de très nombreux pronoms et déterminants, en fait le phonème le plus marqué par la réduction.
Enfin, la fréquence des mots semble montrer des effets contradictoires.},
  note = {https://hal.archives-ouvertes.fr/hal-01462228}
}

@article{bigi2015phonetician,
  title = {SPPAS - Multi-lingual Approaches to the Automatic Annotation of Speech},
  author = {Brigitte Bigi},
  publisher = {International Society of Phonetic Sciences},
  journal = {The Phonetician},
  volume = {111--112},
  pages = {54--69},
  issn = {0741-6164},
  year = {2015},
  abstract = {The first step of most acoustic analyses unavoidably involves the alignment of recorded speech sounds with their phonetic annotation. This step is very labor-intensive and cost-ineffective since it has to be performed manually by experienced phoneticians during many hours of work. This paper describes the main features of SPPAS, a software tool designed for the needs of automatically producing annotations of speech at the level of utterance, word, syllable and phoneme based on the recorded speech sound and its orthographic transcription. In other words, it can automatize the phonetic transcription task for speech materials, as well as the alignment task of transcription with speech recordings for further acoustic analyses. Special attention will be given to the methodology implemented in SPPAS, based on algorithms which are as language-and-task-independent as possible. This procedure allows for the addition of new languages quickly and for the adaptation of this tool to the user's specific needs. Consequently, the quality of the automatic annotations is largely influenced by external resources, and the users can modify the process as needed. In that sense, phoneticians need automatic tools and these tools can be significantly improved by phonetician input.
},
  note = {https://hal.archives-ouvertes.fr/hal-01417876}
}

@inproceedings{fung2015ococosda,
  title = {Automatic word segmentation for spoken Cantonese},
  author = {Roxana Fung and Brigitte Bigi},
  booktitle = {Oriental COCOSDA and Conference on Asian Spoken Language Research and Evaluation (O-COCOSDA/CASLRE)},
  pages = {196--201},
  year = {2015},
  doi = {10.1109/ICSDA.2015.7357891},
  organization = {IEEE},
  abstract = {Though Cantonese is the most influential variety of Chinese other than Mandarin, there are only a limited number of Cantonese corpora available for linguistic studies. Among the essential steps of building a corpus, word segmentation is a necessary but highly challenging task due to the lack of clear word boundary in Cantonese. This paper reports the construction and evaluation of an open-source automatic Cantonese word segmenter developed for Cantonese. The tool is a component of the multilingual SPPAS program designed to be used directly by linguists. It is a free software distributed under a GPL license. The effectiveness of the tool was evaluated by comparing the result of segmenting some samples of a spoken Cantonese corpus manually and automatically using the tool developed. High precision and recall were found in our study. Upon completion, the tool would definitely promote the development of more Cantonese corpora for language related studies.},
  url = {http://sppas.org/bigi/Doc/bigi2015ococosda.pdf},
  note = {https://hal.archives-ouvertes.fr/hal-01455312}
}

@inproceedings{bigi2015ltc,
  title = {Automatic Syllabification of Polish},
  author = {Brigitte Bigi and Katarzyna Klessa},
  booktitle = {7th Language and Technology Conference: Human Language Technologies as a Challenge for Computer Science and Linguistics},
  address = {Poznan, Poland},
  pages = {262--266},
  year = 2015,
  abstract = {This paper presents an approach to automatic detection of syllable boundaries for Polish speech based on a phonetized text input.
First, we discuss selected issues of syllable structure in Polish with a special focus on the needs of rule-based automatic insertion of syllable boundaries. We summarize and verify an existing rule-set for Polish, which is subsequently used as an input information for automatic syllabification with SPPAS, a freely available multiplatform software tool. Then, the applied syllabification methodology is described and illustrated with examples obtained with a Polish speech corpus. Finally, the paper provides information about the syllabification module for Polish that has been implemented as one of the latest extensions of SPPAS.},
  url = {http://ltc.amu.edu.pl/book/papers/SPEECH3-3.pdf},
}

@inproceedings{bigi2015interspeech,
  title = {A syllable-based analysis of speech temporal organization: a comparison between speaking styles in dysarthric and healthy populations},
  author = {Brigitte Bigi and Katarzyna Klessa and Laurianne Georgeton and Christine Meunier},
  booktitle = {Sixteenth Annual Conference of the International Speech Communication Association},
  address = {Dresden, Germany},
  year = {2015},
  abstract = {A comparison of how healthy and dysarthric pathological speakers adapt their production is a way to better understand the processes and constraints that interact during speech production in general. The present study focuses on spontaneous speech obtained with varying recording scenarios from five different groups of speakers. Patients suffering from motor speech disorder (dysarthria) affecting speech production are compared to healthy speakers. Three types of dysarthria have been explored: Parkinson's Disease, Amyotrophic Lateral Sclerosis and Cerebellar ataxia. This paper first presents general figures based on syllable-level annotation mining, including detailed information about healthy/pathological speakers variability.
Then, we report on the results of automatic timing parsing of interval sequences in speech syllable annotations performed using TGA (Time Group Analysis) methodology. We observed that mean syllable-based speaking rates in time groups for the healthy speakers were higher than those measured in the recordings of dysarthric speakers. The variability in timing patterns (duration regression slopes, intercepts, and nPVI) depended also on the speaking styles in particular populations.},
  url = {http://sppas.org/bigi/Doc/bigi2015interspeech.pdf}
}

@InProceedings{bigi2015gespin,
  title = {Searching and retrieving multi-levels annotated data},
  author = {Brigitte Bigi and Jorane Saubesty},
  booktitle = {Proceedings of Gesture and Speech in Interaction - 4th edition},
  address = {Nantes, France},
  year = {2015},
  pages = {31--36},
  note = {https://hal.archives-ouvertes.fr/hal-01195646}
}

@InProceedings{prevot2015sigdial,
  title = {A SIP of CoFee: A Sample of Interesting Productions of Conversational Feedback},
  author = {Laurent Prévot and Jan Gorisch and Roxane Bertrand and Emilien Gorene and Brigitte Bigi},
  booktitle = {16th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  pages = {149--153},
  year = {2015},
  address = {Prague, Czech Republic},
  url = {http://www.aclweb.org/anthology/W/W15/W15-46.pdf#page=169}
}

@misc{bigi2015lpl,
  author = {Brigitte Bigi},
  title = {Uncertainty-tolerant framework for multimodal corpus annotation},
  url = "http://sppas.org/bigi/Doc/seminaire-6fevrier2015/",
  year = {2015},
  howpublished = {Seminar}
}

@misc{bigi2015tutoupf,
  author = {Brigitte Bigi},
  title = {Semi-automatic multimodal annotation (of conversational data)},
  howpublished = {Tutorial},
  url = {http://www.upf.edu/dtcl/activitats/cartells_pdf/Tutorials_7-8_april.pdf},
  booktitle = {Tutorials on Empirical Methods in Language Research},
  address = {Universitat Pompeu Fabra, Barcelona, Spain},
  year = {2015},
  organizer = {Master in Theoretical and Applied Linguistic}
}

@misc{bigi2015tutointerspeech,
  author = {Brigitte
Bigi and Daniel Hirst and Dafydd Gibbon},
  title = {Methodology and software for the semi-automatic annotation and analysis of speech: Human Language Technology meets Linguists},
  address = {Dresden, Germany},
  howpublished = {Tutorial},
  organizer = {Interspeech},
  year = {2015},
  url = {http://sppas.org/bigi/Doc/2015-Interspeech-Tutorial/}
}

@misc{bigi2015tutohk1,
  author = {Brigitte Bigi},
  title = {SPPAS Tutorial: Methodology and software for the semi-automatic annotation of speech},
  address = {Tutorial at Poly U, Hong Kong},
  howpublished = {Tutorial},
  year = {2015},
  url = {http://sppas.org/bigi/Doc/2015-SPPAS-Tutorial-HongKong/}
}

@techreport{bigi2015tutohk2,
  author = {Brigitte Bigi},
  title = {Methodology and software for the semi-automatic annotation of speech},
  year = {2015},
  institution = {Tutorial at Poly U, Hong Kong},
  url = {http://sppas.org/bigi/Doc/2015-SPPAS-Tutorial-HongKong/pdf/SPPAS-tutorial-Handout.pdf},
  howpublished = {Documentation}
}

@misc{bigi2015ircom,
  author = {Brigitte Bigi},
  title = {Annotation automatique et analyse de corpus avec SPPAS},
  howpublished = {Tutorial},
  organizer = {Consortium Corpus Oraux et Multimodaux d'Huma-num},
  year = {2015},
  url = {http://ircom.huma-num.fr}
}

@article{bigi2014lnai,
  title = {A Multilingual Text Normalization Approach},
  author = {Brigitte Bigi},
  journal = {Human Language Technology Challenges for Computer Science and Linguistics, LNAI 8387},
  pages = {515--526},
  year = {2014},
  isbn = {978-3-319-14120-6},
  publisher = {Springer Berlin Heidelberg},
  url = {http://link.springer.com/chapter/10.1007/978-3-319-08958-4_42}
}

@InProceedings{bigi2014lrec,
  author = {Brigitte Bigi and Tatsuya Watanabe and Laurent Prévot},
  title = {Representing Multimodal Linguistic Annotated Data},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  year = {2014},
  address = {Reykjavik, Iceland},
  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard
and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {978-2-9517408-8-4},
  pages = {3386--3392},
  url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/51_Paper.pdf},
  abstract = {The question of interoperability for linguistic annotated resources covers different aspects. First, it requires a representation framework making it possible to compare, and eventually merge, different annotation schema. In this paper, a general description level representing the multimodal linguistic annotations is proposed. It focuses on time representation and on the data content representation: This paper reconsiders and enhances the current and generalized representation of annotations. An XML schema of such annotations is proposed. A Python API is also proposed. This framework is implemented in a multi-platform software and distributed under the terms of the GNU Public License.}
}

@InProceedings{bigi2014lrecor,
  author = {Brigitte Bigi and Roxane Bertrand and Mathilde Guardiola},
  title = {Automatic Detection of Other-Repetition Occurrences: Application to French Conversational Speech},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  year = {2014},
  address = {Reykjavik, Iceland},
  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {978-2-9517408-8-4},
  pages = {836--842},
  url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/71_Paper.pdf},
  abstract = {This paper investigates the discursive phenomenon called other-repetitions (OR), particularly in the context of spontaneous French dialogues. It focuses on their automatic detection and characterization.
A method is proposed to retrieve automatically OR: this detection is based on rules that are applied on the lexical material only. This automatic detection process has been used to label other-repetitions on 8 dialogues of CID - Corpus of Interactional Data. Evaluations performed on one speaker are good with a F1-measure of 0.85. Retrieved OR occurrences are then statistically described: number of words, distance, etc.}
}

@InProceedings{gorisch2014lrec,
  author = {Jan Gorisch and Corine Astésano and Ellen Gurman Bard and Brigitte Bigi and Laurent Prévot},
  title = {Aix Map Task Corpus: the French Multimodal Corpus of Task-oriented Dialogue},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC'14)},
  year = {2014},
  address = {Reykjavik, Iceland},
  editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Hrafn Loftsson and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {978-2-9517408-8-4},
  pages = {2648--2652},
  url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/719_Paper.pdf},
  abstract = {This paper introduces the Aix Map Task corpus, a corpus of audio and video recordings of task-oriented dialogues. It was modelled after the original HCRC Map Task corpus. Lexical material was designed for the analysis of speech and prosody, as described in Astésano et al. (2007). The design of the lexical material, the protocol and some basic quantitative features of the existing corpus are presented. The corpus was collected under two communicative conditions, one audio-only condition and one face-to-face condition. The recordings took place in a studio and a sound attenuated booth respectively, with head-set microphones (and in the face-to-face condition with two video cameras).
The recordings have been segmented into Inter-Pausal-Units and transcribed using transcription conventions containing actual productions and canonical forms of what was said. It is made publicly available online.}
}

@InProceedings{bigi2014apclc,
  author = {Brigitte Bigi},
  year = {2014},
  title = {Automatic Speech Segmentation of French: Corpus Adaptation},
  booktitle = {2nd Asian Pacific Corpus Linguistics Conference},
  pages = {32},
  address = {Hong Kong},
  url = {http://sppas.org/bigi/Doc/bigi2014apclc-slides.pdf}
}

@InProceedings{bigi2014evalita,
  author = {Brigitte Bigi},
  year = {2014},
  title = {The {SPPAS} participation to Evalita 2014},
  booktitle = {Proceedings of the First Italian Conference on Computational Linguistics CLiC-it 2014 and the Fourth International Workshop EVALITA 2014},
  address = {Pisa, Italy},
  editor = {R. Basili and A. Lenci and B. Magnini},
  isbn = {978-886741-472-7},
  volume = {2},
  pages = {127--130},
  url = "http://sppas.org/bigi/Doc/bigi2014evalita.pdf",
}

@InProceedings{bigi2014clic,
  title = {A generic tool for the automatic syllabification of Italian},
  author = {Brigitte Bigi and Caterina Petrone},
  booktitle = {Proceedings of the First Italian Conference on Computational Linguistics CLiC-it 2014 and of the Fourth International Workshop EVALITA 2014},
  pages = {73--77},
  year = {2014},
  volume = {1},
  address = {Pisa, Italy},
  editor = {R. Basili and A. Lenci and B.
Magnini},
  isbn = {978-886741-472-7},
  organization = {Pisa University Press},
  url = {http://www.fileli.unipi.it/projects/clic/proceedings/vol1/CLICIT2014115.pdf}
}

@InProceedings{bigi2014larp,
  author = {Brigitte Bigi and Caterina Petrone and Leonardo Lancia},
  year = {2014},
  url = "http://sppas.org/bigi/Doc/bigi2014larp.pdf",
  title = {Automatic Syllabification of Italian: adaptation from French},
  booktitle = {Laboratory Approaches to Romance Phonology VII},
  pages = {38--40},
  address = {Aix-en-Provence, France}
}

@InProceedings{bigi2014taln,
  title = {Extraction de données orales multi-annotées},
  author = {Brigitte Bigi and Tatsuya Watanabe},
  booktitle = {20èmes Traitement Automatique des Langues Naturelles},
  address = {Marseille, France},
  pages = {556--561},
  year = {2014},
  isbn = {978-2-9518233-6-5},
  url = {http://anthology.aclweb.org/F/F14/F14-2028.pdf}
}

@InProceedings{bigi2013ltc,
  title = {A phonetization approach for the forced-alignment task},
  author = {Brigitte Bigi},
  booktitle = {3rd Less-Resourced Languages workshop and 6th Language and Technology Conference, Poznan (Poland)},
  year = {2013}
}

@InProceedings{prevot2013sigdial,
  title = {A quantitative view of feedback lexical markers in conversational French},
  author = {Laurent Prévot and Brigitte Bigi and Roxane Bertrand},
  booktitle = {14th Annual Meeting of the Special Interest Group on Discourse and Dialogue},
  pages = {87--91},
  year = {2013},
  publisher = {Association for Computational Linguistics},
  address = {Metz, France},
  url = {https://halshs.archives-ouvertes.fr/file/index/docid/983704/filename/bigi2013sigdial.pdf}
}

@InProceedings{hirst2013trasp,
  title = {Building OMProDat: an open multilingual prosodic database},
  author = {Daniel Hirst and Brigitte Bigi and Hyongsil Cho and Hongwei Ding and Sophie Herment and Ting Wang},
  booktitle = {Proceedings of TRASP, Tools and Resources for the Analysis of Speech Prosody, a satellite workshop of Interspeech},
  address = {Aix-en-Provence, France},
  pages = {11--14},
  year = {2013},
  isbn = {978-2-7466-6443-2},
url = {http://www.lpl-aix.fr/~trasp/Proceedings/19767-trasp2013.pdf}
}

@InProceedings{bard2013trasp,
  author = {Ellen Gurman Bard and Corine Astésano and Alice Turk and Mariapaola D'imperio and Noël Nguyen and Laurent Prévot and Brigitte Bigi},
  title = {Aix MapTask: A (rather) new French resource for prosodic and discourse studies},
  booktitle = {Proceedings of TRASP, Tools and Resources for the Analysis of Speech Prosody, a satellite workshop of Interspeech},
  address = {Aix-en-Provence, France},
  pages = {15--19},
  year = {2013},
  isbn = {978-2-7466-6443-2},
  url = {http://lpl-aix.fr/~trasp/Proceedings/20507-trasp2013.pdf}
}

@InProceedings{bigi2013trasp,
  author = {Brigitte Bigi},
  title = {What's new in SPPAS 1.5?},
  booktitle = {Proceedings of TRASP, Tools and Resources for the Analysis of Speech Prosody, a satellite workshop of Interspeech},
  address = {Aix-en-Provence, France},
  pages = {62--65},
  year = {2013},
  isbn = {978-2-7466-6443-2},
  url = {http://lpl-aix.fr/~trasp/Proceedings/20354-trasp2013.pdf}
}

@inproceedings{tellier2013tiger,
  author = {Marion Tellier and Gale Stam and Brigitte Bigi},
  title = {Gesturing While Pausing in Conversation: Self-oriented or Partner-oriented},
  booktitle = {Proceedings of the Tilburg Gesture Research Meeting (TIGER 2013)},
  year = {2013},
  url = {https://tiger.uvt.nl/pdf/papers/tellier.pdf}
}

@Article{bigi2013multimodal,
  author = {Brigitte Bigi and Cristel Portes and Agnès Steuckardt and Marion Tellier},
  title = {A multimodal study of answers to disruptions},
  journal = {Journal on Multimodal User Interfaces},
  volume = {7},
  number = {1--2},
  pages = {55--66},
  year = {2013},
  publisher = {Springer Berlin Heidelberg},
  issn = {1783-7677},
  doi = {10.1007/s12193-012-0110-z},
  note = {https://hal.archives-ouvertes.fr/hal-00781549/},
  abstract = {The interaction between Members of Parliament (MPs) is convention-based and rule-regulated. As instantiations of individual and group confrontations, parliamentary debates display well-regulated competing discursive processes.
Unauthorised interruptions are spontaneous verbal reactions of MPs who interrupt the current speaker. This paper focuses on the answers of the current speaker to these disruptions. It introduces an annotation scheme for a political debate dataset which is mainly in the form of video annotations and audio annotations. The annotations contain information ranging from general linguistic to domain specific information. Some is annotated with automatic tools, and some is manually annotated. One of the goals is to use the information to predict the categories of the answers by the speaker to the disruptions. A typology of such answers is proposed and an automatic categorization system based on a multimodal parametrization is successfully performed.}
}

@Article{herment2012aixox,
  author = {Sophie Herment and Anne Tortel and Brigitte Bigi and Daniel Hirst and Anastassia Loukina},
  title = {AixOx, a multi-layered learners' corpus: automatic annotation},
  journal = {Specialisation and Variation in Language Corpora. Linguistic Insights: Studies in Language and Communication},
  editor = {Ana Díaz-Negrillo and Francisco Javier Díaz-Pérez},
  pages = {41--76},
  volume = {179},
  isbn = {978-3-0343-1316-2},
  year = {2014}
}

@Article{bigi2012,
  author = {Brigitte Bigi},
  title = {The {SPPAS} participation to Evalita 2011},
  journal = {LNAI 7689},
  year = {2012},
  issn = {2240-5186},
  address = {Rome, Italy},
  editor = {B. Magnini and others},
  pages = {312--321},
  publisher = {Springer Berlin Heidelberg},
  note = {https://hal.archives-ouvertes.fr/hal-00983698/},
  abstract = {SPPAS is a new tool to produce automatic annotations which include utterance, word, syllabic and phonemic segmentations from a recorded speech sound and its transcription. SPPAS is "user-friendly" and open source software issued under the GNU Public License.
This paper describes SPPAS algorithms and Italian resources for phonetization and alignment and evaluations related to the "Forced Alignment on Spontaneous Speech" task of the Evalita 2011 campaign. SPPAS is based on a dictionary look-ups approach for the phonetization and the use of the grammar-based Julius engine for alignment. A grammar contains sets of predefined combinations of words and contains one or more representations of the distinct phones that make up each word.}
}

@InProceedings{bigi2012sp,
  author = {Brigitte Bigi and Daniel Hirst},
  title = {SPeech Phonetization Alignment and Syllabification (SPPAS): a tool for the automatic analysis of speech prosody},
  booktitle = {Speech Prosody},
  publisher = {Tongji University Press},
  isbn = {978-7-5608-4869-3},
  pages = {19--22},
  address = {Shanghai, China},
  year = {2012},
  note = {https://hal.archives-ouvertes.fr/hal-00983699/},
  abstract = {SPPAS, SPeech Phonetization Alignment and Syllabification, is a tool to automatically produce annotations which include utterance, word, syllable and phoneme segmentations from a recorded speech sound and its transcription. SPPAS is currently implemented for French, English, Italian and Chinese and there is a very simple procedure to add other languages. The tool is developed for Unix based platforms (Linux, MacOS and Cygwin on Windows) and is specifically designed to be used directly by linguists in conjunction with other tools for the automatic analysis of speech prosody. The tools will all be distributed under a GPL license.
} }

@InProceedings{bigi2012lrecsppas,
  author = {Brigitte Bigi},
  title = {{SPPAS}: a tool for the phonetic segmentation of speech},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation},
  year = {2012},
  address = {Istanbul, Turkey},
  editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {978-2-9517408-7-7},
  pages = {1748--1755},
  url = {http://lrec-conf.org/proceedings/lrec2012/pdf/1116_Paper.pdf},
  abstract = {SPPAS is a tool to produce automatic annotations which include utterance, word, syllabic and phonemic segmentations from a recorded speech sound and its transcription. SPPAS is distributed under the terms of the GNU Public License. It was successfully applied during the Evalita 2011 campaign, on Italian map-task dialogues. It can also deal with French, English and Chinese and there is an easy way to add other languages. The paper describes the development of resources and free tools, consisting of acoustic models, phonetic dictionaries, and libraries and programs to deal with these data.
All of them are publicly available.}
}

@InProceedings{bigi2012lrectrs,
  author = {Brigitte Bigi and Pauline Péri and Roxane Bertrand},
  title = {Orthographic Transcription: which enrichment is required for phonetization?},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation},
  year = {2012},
  address = {Istanbul, Turkey},
  editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Uğur Doğan and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {978-2-9517408-7-7},
  pages = {1756--1763},
  url = {http://lrec-conf.org/proceedings/lrec2012/pdf/1122_Paper.pdf}
}

@InProceedings{stam2012,
  author = {Gale Stam and Marion Tellier and Brigitte Bigi},
  title = {Handling language: The gestures of future foreign language teachers},
  booktitle = {First Wilga Rivers Colloquium on Foreign Language Pedagogy},
  publisher = {American Association for Applied Linguistics},
  address = {Boston, USA},
  url = {http://digitalcommons.nl.edu/cgi/viewcontent.cgi?article=1001&context=faculty_publications},
  year = {2012}
}

@misc{tellier2012,
  author = {Marion Tellier and Gale Stam and Brigitte Bigi},
  year = {2012},
  title = {Same speech, different gestures?},
  booktitle = {5th International Society for Gesture Studies},
  address = {Lund, Sweden},
  howpublished = {Poster}
}

@InProceedings{herment2012,
  author = {Sophie Herment and Anastassia Loukina and Anne Tortel and Daniel Hirst and Brigitte Bigi},
  title = {A multi-layered learners corpus: automatic annotation},
  booktitle = {4th International Conference on Corpus Linguistics Language, corpora and applications: diversity and change},
  year = {2012},
  address = {Jaén, Spain}
}

@InProceedings{bigi2012jep,
  title = {Influence de la transcription sur la phonétisation automatique de corpus oraux},
  author = {Brigitte Bigi and Pauline Péri and Roxane Bertrand},
  booktitle = {Journées d'études
sur la parole},
  pages = {449--456},
  year = {2012},
  address = {Grenoble, France},
  note = {https://hal.archives-ouvertes.fr/hal-00983743}
}

@misc{bigi2012jepsppas,
  author = {Brigitte Bigi},
  title = {SPPAS: segmentation, phonétisation, alignement, syllabation},
  booktitle = {JEP-TALN-RECITAL},
  pages = {9--10},
  year = {2012},
  howpublished = {Demo},
  url = {http://www.aclweb.org/old_anthology/F/F12/F12-5005.pdf}
}

@InProceedings{bigi2012degelssppas,
  author = {Brigitte Bigi},
  title = {SPPAS: un outil user-friendly pour l’alignement texte/son},
  booktitle = {Actes de la conférence conjointe JEP-TALN-RECITAL},
  pages = {85--92},
  year = {2012},
  url = {http://www.aclweb.org/old_anthology/W/W12/W12-1207.pdf}
}

@misc{bigi2012paris,
  author = {Brigitte Bigi},
  title = {Segmentation de la parole : méthode et logiciel},
  url = {http://www.clillac-arp.univ-paris-diderot.fr/projets/charliphonia/2012},
  howpublished = {Tutorial},
  address = {Journées Charliphonia-Longdale III, Paris XIII},
  year = {2012}
}

@InProceedings{bigi2011icmi,
  author = {Brigitte Bigi and Cristel Portes and Agnès Steuckardt and Marion Tellier},
  title = {Multimodal Annotations and Categorization for Political Debates},
  booktitle = {ICMI Workshop on Multimodal Corpora for Machine learning},
  year = {2011},
  address = {Alicante, Spain},
  note = {https://hal.archives-ouvertes.fr/hal-00983742/}
}

@InProceedings{tellier2011degels,
  author = {Marion Tellier and Mathilde Guardiola and Brigitte Bigi},
  title = {Types de gestes et utilisation de l'espace gestuel dans une description spatiale : méthodologie de l'annotation},
  booktitle = {Atelier DEGELS, 18èmes conférence annuelle Traitement Automatique des Langues Naturelles},
  year = {2011},
  pages = {45--56},
  address = {Montpellier, France},
  url = {http://sppas.org/bigi/Doc/bigi2011degels.pdf}
}

@misc{blache2011atala,
  author = {Philippe Blache and Roxane Bertrand and Brigitte Bigi and Robert Espesser and Mathilde Guardiola and Stéphane Rauzy},
  title = {Une expérience d'annotation à large échelle : le projet OTIM},
booktitle = {Journée Atala},
  year = {2011},
  address = {Paris, France},
  howpublished = {Seminar},
  url = {http://sppas.org/bigi/Doc/blache2011atala.pdf}
}

@InProceedings{guardiola2011,
  author = {Mathilde Guardiola and Béatrice {Priego-Valverde} and Brigitte Bigi and Roxane Bertrand},
  title = {Other-repetitions in French face-to-face interactions as a device of conversational humor},
  booktitle = {12th International Pragmatics Conference (IPRA)},
  year = {2011},
  address = {Manchester, UK}
}

@InProceedings{bigi2011taln,
  author = {Brigitte Bigi and Cristel Portes and Agnès Steuckardt and Marion Tellier},
  title = {Catégoriser les réponses aux interruptions dans les débats politiques},
  booktitle = {18èmes conférence annuelle Traitement Automatique des Langues Naturelles},
  year = {2011},
  pages = {167--172},
  address = {Montpellier, France}
}

@InProceedings{bigi2010lrec,
  author = {Brigitte Bigi and Christine Meunier and Irina Nesterenko and Roxane Bertrand},
  title = {Automatic detection of syllable boundaries in spontaneous speech},
  booktitle = {Language Resource and Evaluation Conference},
  year = {2010},
  pages = {3285--3292},
  address = {Valletta, Malta},
  url = {http://lrec-conf.org/proceedings/lrec2010/pdf/219_Paper.pdf},
  abstract = {This paper presents the outline and performance of an automatic syllable boundary detection system. The syllabification of phonemes is performed with a rule-based system, implemented in a Java program. Phonemes are categorized into 6 classes and specific rules are developed to deal with a French spontaneous speech corpus. Moreover, the proposed phonemes, classes and rules are listed in an external configuration file of the tool (under GPL licence).
Finally, performances are evaluated and compared to state-of-the-art systems and show significant improvements}
}

@InProceedings{blache2010icgl,
  author = {Philippe Blache and Brigitte Bigi and Laurent Prévot and Stéphane Rauzy and Julien Seinturier},
  title = {Annotation schemes, annotation tools and the question of interoperability: from Typed Feature Structures to XML Schemas},
  booktitle = {Second International Conference on Global Interoperability for Language Resource},
  year = {2010},
  address = {Hong Kong},
  url = {http://sppas.org/bigi/Doc/bigi2010icgl.pdf},
  abstract = {The multiplication of annotation schemes and coding formats is a severe limitation for interoperability. We propose in this paper an approach specifying the annotation scheme in terms of typed feature structures, that are in a second step translated into XML schemas, from which data are encoded. This approach guarantees the fact that no information is lost when translating one format into another.}
}

@InProceedings{blache2010law,
  author = {Philippe Blache and Roxane Bertrand and Brigitte Bigi and Emmanuel Bruno and E. Cela and Robert Espesser and Gaëlle Ferré and Mathilde Guardiola and Daniel Hirst and E.-P. Magro and Jean-Claude Martin and Christine Meunier and Marie-Annick Morel and Elisabeth Murisasco and Irina Nesterenko and Pascal Nocera and Berthille Pallaud and Laurent Prévot and Béatrice {Priego-Valverde} and Julien Seinturier and Ning Tan and Marion Tellier and Stéphane Rauzy},
  title = {Multimodal Annotation of Conversational Data},
  booktitle = {The Fourth Linguistic Annotation Workshop},
  year = {2010},
  pages = {186--191},
  isbn = {978-1-932432-72-5},
  address = {Uppsala, Sweden},
  url = {http://dl.acm.org/citation.cfm?id=1868720.1868749},
  acmid = {1868749},
  publisher = {Association for Computational Linguistics},
  abstract = {We propose in this paper a broad-coverage approach for multimodal annotation of conversational data.
Large annotation projects addressing the question of multimodal annotation bring together many different kinds of information from different domains, with different levels of granularity. We present in this paper the first results of the OTIM project aiming at developing conventions and tools for multimodal annotation.}
}

@InProceedings{bigi2010jepr,
  author = {Brigitte Bigi and Roxane Bertrand and Mathilde Guardiola},
  title = {Recherche automatique d'hétéro-répétitions dans un dialogue oral spontané},
  booktitle = {XXVIIIèmes Journées d'Étude sur la Parole},
  year = {2010},
  address = {Mons, Belgium}
}

@InProceedings{bigi2010jeps,
  author = {Brigitte Bigi and Christine Meunier and Irina Nesterenko and Roxane Bertrand},
  title = {Annotation automatique en syllabes d'un dialogue oral spontané},
  booktitle = {XXVIIIèmes Journées d'Étude sur la Parole},
  year = {2010},
  address = {Mons, Belgium}
}

@InCollection{demori2009,
  author = {Renato {De Mori} and Brigitte Bigi},
  title = {Principles of Speech Recognition},
  chapter = {6},
  booktitle = {Spoken Language Processing},
  editor = {Joseph Mariani},
  publisher = {Hermès},
  pages = {213--238},
  isbn = {978-1-84821-031-8},
  year = {2009}
}

@INPROCEEDINGS{do2009taln,
  author = {T.-N.-D. Do and Viet-Bac Le and Brigitte Bigi and Laurent Besacier and Eric Castelli},
  title = {Exploitation d'un corpus bilingue comparable pour la création d'un système de traduction probabiliste Vietnamien-Français},
  booktitle = {16èmes conférence annuelle Traitement Automatique des Langues Naturelles},
  year = {2009},
  address = {Senlis, France}
}

@INPROCEEDINGS{do2009wmt,
  author = {T.-N.-D.
Do and Viet-Bac Le and Brigitte Bigi and Laurent Besacier and Eric Castelli},
  title = {Mining a comparable text corpus for a Vietnamese-French machine translation system},
  booktitle = {Fourth Workshop on Statistical Machine Translation},
  year = {2009},
  pages = {165--172},
  address = {Athens, Greece},
  url = {http://www.anthology.aclweb.org/W/W09/W09-0430.pdf},
  abstract = {This paper presents our first attempt at constructing a Vietnamese-French statistical machine translation system. Since Vietnamese is an under-resourced language, we concentrate on building a large Vietnamese-French parallel corpus. A document alignment method based on publication date, special words and sentence alignment result is proposed. The paper also presents an application of the obtained parallel corpus to the construction of a Vietnamese-French statistical machine translation system, where the use of different units for Vietnamese (syllables, words, or their combinations) is discussed.}
}

@INPROCEEDINGS{seng2009interspeech,
  author = {Sopheap Seng and Laurent Besacier and Brigitte Bigi and Eric Castelli},
  title = {Multiple Text Segmentation for Statistical Language Modeling},
  booktitle = {Interspeech},
  year = {2009},
  address = {Brighton, UK},
  url = {http://sppas.org/bigi/Doc/send2009interspeech.pdf}
}

@INPROCEEDINGS{seng2009taln,
  author = {Sopheap Seng and Laurent Besacier and Brigitte Bigi and Eric Castelli},
  title = {Segmentation multiple d'un flux de données textuelles pour la modélisation statistique du langage},
  booktitle = {16èmes conférence annuelle Traitement Automatique des Langues Naturelles},
  year = {2009},
  address = {Senlis, France}
}

@INPROCEEDINGS{le2008sltu,
  author = {Viet-Bac Le and Laurent Besacier and Sopheap Seng and Brigitte Bigi and T.-N.-D.
Do},
  title = {Recent advances in Automatic Speech Recognition for Vietnamese},
  booktitle = {International Workshop on Spoken Languages Technologies for Under-resourced languages},
  year = {2008},
  address = {Hanoi, Vietnam},
  pages = {47--52},
  url = {http://sppas.org/bigi/Doc/le2008sltu.pdf}
}

@INPROCEEDINGS{seng2008sltu,
  author = {Sopheap Seng and Sethserey Sam and Viet-Bac Le and Brigitte Bigi and Laurent Besacier},
  title = {Which unit for acoustic and language modeling for Khmer Automatic Speech Recognition},
  booktitle = {International Workshop on Spoken Languages Technologies for Under-resourced languages},
  year = {2008},
  pages = {33--38},
  address = {Hanoi, Vietnam},
  url = {http://sppas.org/bigi/Doc/seng2008sltu.pdf}
}

@INPROCEEDINGS{le2008icassp,
  author = {Viet-Bac Le and Sopheap Seng and Laurent Besacier and Brigitte Bigi},
  title = {WORD/SUB-WORD lattices decomposition and combination FOR Speech Recognition},
  booktitle = {IEEE International conference on Acoustics, Speech and Signal Processing},
  year = {2008},
  address = {Las Vegas, USA},
  pages = {4321--4324},
  organization = {IEEE},
  isbn = {978-1-4244-1483-3},
  doi = {10.1109/ICASSP.2008.4518611},
  abstract = {This paper presents the benefit of using multiple lexical units in the post-processing stage of an ASR system. Since the use of sub-word units can reduce the high out-of-vocabulary rate and improve the lack of text resources in statistical language modeling, we propose several methods to decompose, normalize and combine word and sub-word lattices generated from different ASR systems. By using a sub-word information table, every word in a lattice can be decomposed into sub-word units. These decomposed lattices can be combined into a common lattice in order to generate a confusion network. This lattices combination scheme results in an absolute syllable error rate reduction of about 1.4% over the sentence MAP baseline method for a Vietnamese ASR task.
By comparing with the N-best lists combination and voting method, the proposed method works better.}
}

@INPROCEEDINGS{seng2008lrec,
  author = {Sopheap Seng and Sethserey Sam and Laurent Besacier and Brigitte Bigi and Eric Castelli},
  title = {First Broadcast News Transcription System for Khmer Language},
  booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation},
  year = {2008},
  pages = {2658--2661},
  address = {Marrakech, Morocco},
  editor = {Nicoletta Calzolari and Khalid Choukri and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis and Daniel Tapias},
  publisher = {European Language Resources Association (ELRA)},
  isbn = {2-9517408-4-0},
  url = {http://lrec-conf.org/proceedings/lrec2008/pdf/661_paper.pdf},
  abstract = {In this paper we present an overview on the development of a large vocabulary continuous speech recognition (LVCSR) system for Khmer, the official language of Cambodia, spoken by more than 15 million people. As an under-resourced language, develop a LVCSR system for Khmer is a challenging task. We describe our methodologies for quick language data collection and processing for language modeling and acoustic modeling. For language modeling, we investigate the use of word and sub-word as basic modeling unit in order to see the potential of sub-word units in the case of unsegmented language like Khmer. Grapheme-based acoustic modeling is used to quickly build our Khmer language acoustic model. Furthermore, the approaches and tools used for the development of our system are documented and made publicly available on the web.
We hope this will contribute to accelerate the development of LVCSR system for a new language, especially for under-resource languages of developing countries where resources and expertise are limited.}
}

@INPROCEEDINGS{seng2008jep,
  author = {Sopheap Seng and Sethserey Sam and Viet-Bac Le and Brigitte Bigi and Laurent Besacier},
  title = {Reconnaissance automatique de la parole en langue khmère : Quelles unités pour la modélisation du langage et la modélisation acoustique ?},
  booktitle = {XXVIIèmes Journees d'Etudes sur la Parole},
  year = {2008},
  address = {Avignon, France},
  url = {http://www.afcp-parole.org/doc/Archives_JEP/2008_XXVIIe_JEP_Avignon/PDF/avignon2008_pdf/JEP/004_jep_1624.pdf}
}

@INPROCEEDINGS{bigi2008jadt,
  author = {Brigitte Bigi and Viet-Bac Le},
  title = {Normalisation et alignement de corpus français et vietnamiens : Format et Logiciels},
  booktitle = {9èmes journées internationales d'analyse statistique des données textuelles},
  address = {Lyon, France},
  year = {2008},
  pages = {199--207},
  url = {http://sppas.org/bigi/Doc/bigi2008jadt.pdf}
}

@misc{bigi2006itc1,
  author = {Brigitte Bigi},
  title = {Traitement Automatique du Langage Naturel},
  address = {Institute of Technology of Cambodia - Phnom Penh, Cambodia},
  howpublished = {Seminar},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006_01_TALN.pdf}
}

@misc{bigi2006itc2,
  author = {Brigitte Bigi},
  title = {Corpus},
  address = {Institute of Technology of Cambodia - Phnom Penh, Cambodia},
  howpublished = {Seminar},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006_02_CORPUS.pdf}
}

@misc{bigi2006itc3,
  author = {Brigitte Bigi},
  title = {Modélisation Statistique du Langage (théorie)},
  address = {Institute of Technology of Cambodia - Phnom Penh, Cambodia},
  howpublished = {Seminar},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006_02_ML1.pdf}
}

@misc{bigi2006itc4,
  author = {Brigitte Bigi},
  title = {Modélisation Statistique du Langage (pratique)},
  address = {Institute of Technology of Cambodia -
Phnom Penh, Cambodia},
  howpublished = {Seminar},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006_02_ML2.pdf}
}

@misc{bigi2006clips,
  author = {Brigitte Bigi},
  title = {Corpus et Statistiques},
  howpublished = {Seminar},
  address = {CLIPS, Grenoble},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006clips_seminar.pdf}
}

@InProceedings{bigi2006jep,
  author = {Brigitte Bigi},
  title = {Proposition d'une méthodologie pour la sélection et l'évaluation du vocabulaire d'un système de RAP},
  booktitle = {XXVI-èmes Journées d'Etudes sur la Parole},
  address = {Dinard, France},
  year = {2006},
  url = {http://sppas.org/bigi/Doc/bigi2006jep.pdf}
}

@InProceedings{bigi2004icslp,
  author = {Brigitte Bigi and Yan Huang and Renato {De Mori}},
  title = {Vocabulary and language model adaptation using information retrieval.},
  booktitle = {Proceedings of the International Conference on Spoken Language Processing},
  volume = {II},
  pages = {1361--1364},
  address = {Jeju Island, Korea},
  year = {2004},
  url = {http://http.icsi.berkeley.edu/ftp/global/pub/speech/papers/icslp2004-bigi.pdf},
  abstract = {The goal of vocabulary optimization is to construct a vocabulary with exactly those words that are the most likely to appear in the test data. We will present a new approach to reduce the out-of-vocabulary (OOV) rate by adapting the vocabulary model during the ASR process. This method can also be used for the statistical language model (SLM) adaptation. An information retrieval system is used after the first pass of the ASR system to obtain a set of relevant documents. These documents are then used to generate the new vocabulary and/or corpus. In this paper, we propose a new retrieving method well-adapted for this purpose. Experiments were carried out on French with a 28% OOV rate reduction.
Experiments were also carried out on English for the SLM adaptation, with 7.9% perplexity reduction, and minor WER improvement.}
}

@InProceedings{lamy2004jep,
  author = {Richard Lamy and Daniel Moraru and Brigitte Bigi and Laurent Besacier},
  title = {Premiers pas du CLIPS sur les données d'évaluation ESTER},
  booktitle = {Proc. of Journées d’Etude de la Parole, Fès, Maroc},
  year = {2004},
  address = {Fès, Maroc},
  url = {http://www.afcp-parole.org/doc/Archives_JEP/2004_XXVe_JEP_Fes/actes/jep2004/Lamy-Moraru-etal.pdf}
}

@InProceedings{vu2004taln,
  author = {Quang Vu-minh and Laurent Besacier and Hervé Blanchon and Brigitte Bigi},
  title = {Modèle de langage sémantique pour la RAP dans un contexte de traduction},
  booktitle = {Proc. of 11èmes conférence annuelle Traitement Automatique des Langues Naturelles},
  address = {Fès, Maroc},
  year = {2004},
  url = {http://sppas.org/bigi/Doc/bigi2004taln.pdf}
}

@InProceedings{vu2004rivf,
  author = {Minh-Quang Vu and Laurent Besacier and Eric Castelli and Brigitte Bigi and Hervé Blanchon},
  title = {Interchange Format-based Language Model for Automatic Speech Recognition in Speech-to-Speech Translation.},
  booktitle = {Second international Conference RIVF},
  pages = {47--50},
  address = {Hanoi, Vietnam},
  year = {2004},
  url = {http://sppas.org/bigi/Doc/bigi2004rivf.pdf}
}

@Article{bigi2003ecir,
  author = {Brigitte Bigi},
  editor = {Fabrizio Sebastiani},
  title = {Using Kullback-Leibler Distance for Text Categorization},
  journal = {Advances in Information Retrieval: 25th European Conference on IR Research},
  address = {Pisa, Italy},
  year = {2003},
  publisher = {Springer Berlin Heidelberg},
  pages = {305--319},
  isbn = {978-3-540-36618-8},
  doi = {10.1007/3-540-36618-0_22},
  url = {http://dx.doi.org/10.1007/3-540-36618-0_22},
  abstract = {A system that performs text categorization aims to assign appropriate categories from a predefined classification scheme to incoming documents.
These assignments might be used for varied purposes such as filtering, or retrieval. This paper introduces a new effective model for text categorization with great corpus (more or less 1 million documents). Text categorization is performed using the Kullback-Leibler distance between the probability distribution of the document to classify and the probability distribution of each category. Using the same representation of categories, experiments show a significant improvement when the above mentioned method is used. KLD method achieve substantial improvements over the tfidf performing method.}
}

@InProceedings{le2003eurospeech,
  author = {Viet-Bac Le and Brigitte Bigi and Laurent Besacier and Eric Castelli},
  title = {Using the web for fast language model construction in minority languages.},
  booktitle = {Eurospeech},
  address = {Geneva},
  pages = {3117--3120},
  year = {2003},
  organization = {Citeseer},
  abstract = {The design and construction of a language model for minority languages is a hard task. By minority language, we mean a language with small available resources, especially for the statistical learning problem. In this paper, a new methodology for fast language model construction in minority languages is proposed. It is based on the use of Web resources to collect and make efficient textual corpora. By using some filtering techniques, this methodology allows a quick and efficient construction of a language model with a small cost in term of computational and human resources. Our primary experiments have shown excellent performance of the Web language models vs newspaper language models using the proposed filtering methods on a majority language (French).
Following the same way for a minority language (Vietnamese), a valuable language model was constructed in 3 month with only 15% new development to modify some filtering tools.}
}

@Article{bigi2002iasted,
  author = {Brigitte Bigi and Renato {De Mori}},
  title = {Text segmentation using a cache memory},
  journal = {Control and intelligent systems},
  volume = {30},
  number = {3},
  pages = {93--100},
  year = {2002},
  publisher = {Acta Press},
  issn = {1480-1752},
  note = {Paper no. 201-1191},
  url = {http://sppas.org/bigi/Doc/bigi2002iasted-authorversion.pdf},
  abstract = {This paper describes the application of an information-theoretic approach to document segmentation. Several segmentation criteria are proposed using topic shift detection or just blindly comparing the contents of cache memories where keywords are temporarily stored as a document is analyzed. Experiments with a large corpus of articles from the French newspaper Le Monde show tangible advantages when different models are combined with a suitable strategy. Experimental results show that different strategies for topic shift detection have to be used depending on whether high recall or high precision are sought.
Furthermore, methods based on topic independent distributions provide complementary candidates with respect to the use of topic-dependent distributions leading to an increase in recall with a minor loss in precision.}
}

@InProceedings{bigi2002specom,
  author = {Brigitte Bigi and Salma Jamoussi and Kamel Smaïli},
  title = {Dynamic topic identification: Introduction of trigger pairs in the cache model},
  booktitle = {Proceedings of the international workshop Speech and Computer},
  year = {2002},
  address = {Moscow, Russia},
  url = {https://hal.inria.fr/inria-00100828/}
}

@InCollection{demori2002,
  author = {Renato {De Mori} and Brigitte Bigi},
  title = {Principes de reconnaissances},
  chapter = {5},
  booktitle = {Traitement Automatique du Langage Parlé, Tome 2 : Reconnaissance},
  editor = {Joseph Mariani},
  publisher = {Editions Hermès},
  year = {2002},
  isbn = {2-7462-0441-X}
}

@InProceedings{bigi2002taln,
  author = {Brigitte Bigi and Kamel Smaïli},
  title = {Identification thématique hiérarchique: Application aux forums de discussions},
  booktitle = {9ème conférence sur le Traitement Automatique des Langues Naturelles},
  volume = {1},
  pages = {115--124},
  year = {2002}
}

@Article{carpineto2001,
  author = {Claudio Carpineto and Renato {De Mori} and Giovanni Romano and Brigitte Bigi},
  title = {An Information Theoretic Approach to Automatic Query Expansion},
  journal = {ACM Transactions On Information Systems},
  volume = {19},
  number = {1},
  pages = {1--27},
  year = {2001},
  url = {http://dl.acm.org/citation.cfm?doid=366836.366860},
  abstract = {Techniques for automatic query expansion from top retrieved documents have shown promise for improving retrieval effectiveness on large collections; however, they often rely on an empirical ground, and there is a shortage of cross-system comparisons. Using ideas from Information Theory, we present a computationally simple and theoretically justified method for assigning scores to candidate expansion terms.
Such scores are used to select and weight expansion terms within Rocchio's framework for query reweighting. We compare ranking with information-theoretic query expansion versus ranking with other query expansion techniques, showing that the former achieves better retrieval effectiveness on several performance measures. We also discuss the effect on retrieval effectiveness of the main parameters involved in automatic query expansion, such as data sparseness, query difficulty, number of selected documents, and number of selected terms, pointing out interesting relationships.}
}

@InProceedings{bigi2001ranlp,
  author = {Brigitte Bigi and Armelle Brun and Jean-Paul Haton and Kamel Smaïli and Imed Zitouni},
  title = {Dynamic Topic Identification: Towards Combination of Methods},
  booktitle = {Proceedings of the Recent Advances in Natural Language Processing},
  year = {2001},
  address = {Tzigov Chark, Bulgaria},
  pages = {255--257},
  url = {https://hal.inria.fr/inria-00100481/},
  abstract = {This paper presents several statistical methods for topic identification (TID): topic unigrams, cache model, TFIDF classifier, topic perplexity, and weighted model. Our work aims to improve these methods by confronting them to very different data, measuring their potential complementarity and their TID performance with simple combinations. Statistical topic identification methods depend not only on a corpus, but also on its type. This study allows to advance the cache model which achieves a TID performance of 82%.
This performance has been increased to 82.3% with our best linear combination.} } @InProceedings{bigi2001spire, author = {Brigitte Bigi and Armelle Brun and Jean-Paul Haton and Kamel Smaïli and Imed Zitouni}, title = {A Comparative Study of Topic Identification on Newspaper and E-mail}, booktitle = {Proceedings of the 8th International Symposium on String Processing and Information Retrieval Conference, Sponsored by IEEE Computer Society}, address = "Laguna de San Rafael, Chili", year = {2001}, pages = {238--241}, url = {https://hal.inria.fr/inria-00107535}, abstract = {This paper presents several statistical methods for topic identification on two kinds of textual data: newspaper articles and e-mails. Five methods are tested on these two corpora: topic unigrams, cache model, TFIDF classifier, topic perplexity, and weighted model. Our work aims to study these methods by confronting them to very different data. This study is very fruitful for our research. Statistical topic identification methods depend not only on a corpus, but also on its type. One of the methods achieves a topic identification of 80% on a general newspaper corpus but does not exceed 30% on e-mail corpus. Another method gives the best result on e-mails, but has not the same behavior on a newspaper corpus. We also show in this paper that almost all our methods achieve good results in retrieving the first two manually annotated labels.} } @InProceedings{bigi2001specom, author = {Brigitte Bigi and Armelle Brun and Kamel Smaïli and Jean-Paul Haton}, title = {A Hierarchical Approach for Topic Identification}, booktitle = {Proceedings of the international workshop Speech and Computer}, year = {2001}, address = {Moscow, Russia}, url = {https://hal.inria.fr/inria-00107536/}, abstract = {This paper focuses on language model adaptation, and more especially on topic identification (TID) for Automatic Speech Recognition (ASR). The structure of a set of topics is redefined by the introduction of a hierarchy. 
TID models may then make use of the semantic relationships between parent and son nodes of the topic-tree. The originality of the approach presented in this article lies in the allocation of a unique vocabulary to brother nodes, which rests on the use of two backing-off levels. In comparison with TID performance when using a non-hierarchical approach, results encourage us to carry on in this way.} } @ARTICLE{bigi2000sp, author = {Brigitte Bigi and Renato {De Mori} and Marc El-Bèze and Thierry Spriet}, title = {A Fuzzy Decision Strategy for Topic Identification and Dynamic Selection of Language Models}, journal = {Special Issue on Fuzzy Logic in Signal Processing, Signal Processing Journal}, publisher = {Elsevier Science Journals}, volume = {80}, number = {6}, pages = {1085--1097}, year = {2000}, url = {http://sppas.org/bigi/Doc/bigi2000spj-authorversion.pdf}, abstract = {The paper introduces a new effective model for topic recognition. The model follows a multi-expert decision paradigm based on fuzzy relations in which fuzzy variables express degrees of reliability of expert decision. Heterogeneous measures are integrated by the fuzzy relations whose structure and components may evolve in time. Experiments resulted in more than 80% topic classification accuracy on articles of the French newspaper Le Monde which describe a very large variety of facts with a very large vocabulary (of the order of 500,000 words). Experiments show a significant improvement when the above mentioned integration of multi-expert decision is used. A robust strategy for dynamic Language Model (LM) selection, based on topic recognition and switching between topic models, is proposed. It is effective because it relies on a small set of well trained topic-dependent LMs and on reliable topic recognition. 
By using perplexity as a performance measure of the LM switching model, a tangible reduction is observed with respect to the use of a single, general, static LM.} } @InProceedings{bigi2000jep, author = {Brigitte Bigi and Renato {De Mori} and Thierry Spriet}, title = {Reconnaissance thématique à partir de textes dictés et Adaptation dynamique de modèles de langages thématiques}, booktitle = {XXIIIèmes Journées d'Etudes sur la Parole}, year = {2000}, pages = {301-304}, address = {Aussois, France}, url = {http://sppas.org/bigi/Doc/bigi2000jep.pdf} } @misc{bigi1999rjc, author = {Brigitte Bigi}, title = {Sélection dynamique de modèles de langage en RAP}, booktitle = {Rencontre des jeunes chercheurs en parole}, year = {1999}, address = {Avignon, France} } @InProceedings{bigi1998icslp, author = {Brigitte Bigi and Renato {De Mori} and Marc El-Bèze and Thierry Spriet}, title = {Detecting topic shifts using a cache memory}, booktitle = {5th International Conference on Spoken Language Processing}, year = {1998}, address = {Sydney, Australia}, url = {http://sppas.org/bigi/Doc/bigi1998icslp.pdf}, abstract = {The use of cache memories and symmetric Kullback-Leibler distances is proposed for topic classification and topic-shift detection. Experiments with a large corpus of articles from the French newspaper "Le Monde show tangible advantages when different models are combined with a suitable strategy. Experimental results show that different strategies for topic shift detection have to be used depending on whether high recall or high precision are sought. 
Furthermore, methods based on topic independent distributions provide complementary candidates with respect to the use of topic-dependent distributions leading to an increase in recall with a minor loss in precision.} } @InProceedings{bigi1998jep, author = {Brigitte Bigi and Renato {De Mori} and Marc El-Bèze and Thierry Spriet}, title = {Combinaison de modèles de langage pour l'identification de thèmes}, booktitle = {XXIIèmes Journées d'Etudes sur la Parole}, year = {1998}, pages = {347-350}, address = {Martigny, Suisse}, url = {http://sppas.org/bigi/Doc/bigi1998jep.pdf} } @InProceedings{bigi1997asru, author = {Brigitte Bigi and Renato {De Mori} and Marc El-Bèze and Thierry Spriet}, title = {Combined models for topic spotting and topic-dependent language modeling}, booktitle = {IEEE Workshop on Automatic Speech Recognition and Understanding Proceedings}, year = {1997}, editor = {S. Furui, B. H. Huang and Wu Chu, IEEE Signal Processing Society Publ, NY}, pages = {535-542} address = "Santa Barbara, USA", doi = {10.1.1.40.2668}, url = {http://sppas.org/bigi/Doc/bigi1997asru.pdf}, abstract = {A new statistical method for Language Modeling and spoken document classification is proposed. It is based on a mixture of topic dependent probabilities. Each topic dependent probability is in turn a mixture of n-gram probabilities and the probability of Kullback-Lieber (KL) distances between key-word unigrams and distribution obtained from the content of a cache memory. 
Experimental result on topic classification using a corpus of 60 Mword from the French newspaper Le Monde show the excellent performance of the cache memory and its complementary role in providing different statistics for the decision process.} } @MastersThesis{Bigi2000phd, author = {Brigitte Bigi}, title = {Contribution à la modélisation du langage pour des applications de recherche documentaire et de traitement de la parole}, organization = {Laboratoire Informatique d'Avignon}, school = {Avignon University}, type = {Phd's Thesis}, year = {2000}, type = {Doctoral Dissertation}, url = {http://sppas.org/bigi/Doc/bigi2000these.pdf} } @misc{bigi1997rjc, author = {Brigitte Bigi}, title = {Modèles de langage basés sur l'identification de thèmes}, booktitle = {Rencontre des jeunes chercheurs en parole}, year = {1997}, address = {La Rochelle, France} } @MastersThesis{bigi1997dea, author = {Brigitte Bigi}, title = {Combinaison de modèles de langages thématiques}, type = {Master's Thesis}, organization = {Department of Computer Science and Engineering of Luminy}, school = {Aix-Marseille Univ}, year = {1997} }