@article {524, title = {Handy prosody: how hands can help you hear}, year = {2023}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {Utrecht, The Netherlands}, abstract = {Speech conveys both segmental information about vowels and consonants and suprasegmental information about, for instance, intonation, rhythm, and lexical stress, also known as the prosody of speech. However, in face-to-face conversations, we not only exchange sounds; we also move, nod, and gesture to the rhythm of our speech. In this keynote, I will demonstrate how the timing of hand gestures contributes to audiovisual prosody perception, with a focus on lexical stress. For instance, evidence for a {\textquoteleft}manual McGurk effect{\textquoteright} showcases how even relatively simple flicks of the hands can guide whether you hear {\textquotedblleft}PLAto{\textquotedblright} or {\textquotedblleft}plaTEAU{\textquotedblright}. Moreover, human listeners are shown to actively weigh various multisensory cues to prosody depending on the listening conditions at hand. Thus, these findings emphasize that prosody is a multimodal linguistic phenomenon, with the voice, lips, and even hands conveying prosody in concert.}, author = {Hans Rutger Bosker} } @article {513, title = {Both Contextual and Talker-Bound F0 Information Affect Voiceless Fricative Perception}, year = {2022}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {Utrecht, The Netherlands}, abstract = {Speech perception is sensitive to context. An example of this is the contrastive effect of fundamental frequency (F0) on the perception of voiceless fricatives{\textquoteright} spectral center of gravity (CoG) (e.g., Niebuhr, 2017). However, whether knowledge about a talker{\textquoteright}s characteristic mean F0 can produce similar effects remains unknown. The present study therefore investigated the effects of contextual (Exp 1) and talker-bound (Exp 2) F0 information on the perception of the voiceless fricatives /s/ and /ʃ/. In Experiment 1, in a 2AFC task, native Dutch listeners (N=10) categorized target words as the Dutch words {\textquotedblleft}sok{\textquotedblright} /sɔk/ or {\textquotedblleft}sjok{\textquotedblright} /ʃɔk/, embedded in a carrier sentence ({\textquotedblleft}Nu komt het woord...{\textquotedblright}, {\textquoteleft}Now comes the word...{\textquoteright}), in 3 intermixed F0 conditions. The fricatives were tokens from a synthetic 8-step fricative continuum from /s/ to /ʃ/. The carrier sentence was pitch-shifted {\textpm}4 semitones to create High-F0 and Low-F0 context conditions, alongside a Mid-F0 (i.e., non-shifted) control condition. Ambiguous fricatives were perceived as more /s/-like in Low-F0 sentences than in High-F0 sentences. In Experiment 2, new participants (N=32) first listened to 20 minutes of speech (exposure) from the same talker, whose voice had been consistently pitch-shifted up (High-F0 group) or down (Low-F0 group) by 4 semitones. Afterwards, a 5-step subset of the original 8-step fricative continuum was used in a 2AFC task in which participants categorized stimuli without carrier sentences as {\textquotedblleft}sok{\textquotedblright} or {\textquotedblleft}sjok{\textquotedblright}. The continuum was again perceived as more /s/-like by the Low-F0 group than by the High-F0 group. Together, the findings suggest that listeners use not only the immediate context but also previously established knowledge about talkers{\textquoteright} typical F0 to interpret incoming speech sounds. References: Niebuhr, O. (2017).
On the perception of {\textquotedblleft}segmental intonation{\textquotedblright}: F0 context effects on sibilant identification in German. EURASIP Journal on Audio, Speech, and Music Processing, 2017(1), 19. https://doi.org/10.1186/s13636-017-0115-3 }, author = {Ulu{\c s}ahin, Orhun and Hans Rutger Bosker and James M. McQueen and Meyer, Antje} } @article {512, title = {Recalibration of lexical stress perception can be driven by visual beat gestures}, year = {2022}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {Utrecht, The Netherlands}, abstract = {Auditory speech is highly variable. Listeners may therefore use the visual modality to disambiguate ambiguous speech sounds. For instance, when repeatedly presented with an ambiguous sound /a?a/ midway between /aba/ and /ada/, paired with a video of a talker producing either /aba/ or /ada/, listeners recalibrate their perception of a subsequently presented auditory /aba/{\textendash}/ada/ continuum (Bertelson et al., 2003). Here we tested whether recalibration can also occur for lexical stress perception. In Experiment 1, participants were exposed to an ambiguously stressed token of /ka.nɔn/, perceptually midway between Dutch CAnon [strong-weak (SW); {\textquotedblleft}canon{\textquotedblright}] and kaNON [weak-strong (WS); {\textquotedblleft}cannon{\textquotedblright}], disambiguated visually by a beat gesture aligned to either the first or the second syllable. In a later test phase, participants categorized an auditory CAnon {\textendash} kaNON lexical stress continuum. The results revealed that participants{\textquoteright} responses in the test phase shifted in the direction of the disambiguating beat gestures they had seen in the exposure phase. In Experiment 2, participants were exposed to a different ambiguous word (/vo:r.na:m/) but tested on the same CAnon {\textendash} kaNON continuum, to assess whether the effect would generalize to other words. However, the results showed no such generalization. Ongoing work is investigating whether generalization is modulated by acoustic distance. Nonetheless, the effect was clearly present across multiple auditory steps in Experiment 1. We therefore suggest that beat gestures can recalibrate lexical stress perception and thus have a long-lasting effect on auditory perception.}, author = {Bujok, Ronny and Peeters, David and Meyer, Antje and Hans Rutger Bosker} } @article {487, title = {Jouw {\textquotedblleft}voornaam{\textquotedblright} is niet mijn {\textquotedblleft}voornaam{\textquotedblright}: An acoustic analysis of individual talker differences in producing lexical stress in Dutch}, year = {2021}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {online}, abstract = {Different people talk differently, even speakers from the same region. This individual variability results in large acoustic variability in speech, both at the segmental level (productions of vowels and consonants) and at the suprasegmental, or prosodic, level (e.g., lexical stress). While individual differences in segment production are well established in the literature, relatively little is known about how individual talkers differ in their prosody. The present study examined individual-talker differences in productions of lexical stress. We recorded 744 tokens of Dutch segmentally overlapping words (e.g., VOORnaam vs. voorNAAM; {\textquoteleft}first name{\textquoteright} vs.
{\textquoteleft}respectable{\textquoteright}) in variable sentence contexts from 40 native speakers of Dutch (balanced for gender; relatively homogeneous, Nijmegen-centered sample), and measured acoustic cues to lexical stress (mean F0, F0 variation, duration, spectral tilt, intensity, and vowel quality). Linear Discriminant Analyses (LDA) on each individual participant{\textquoteright}s data yielded a set of cue weights per participant, revealing their phonetic cue-weighting strategies. Results showed {\textendash} on top of a general trend to rely primarily on mean F0, intensity, and duration {\textendash} that each participant also employed a unique combination of cues to signal lexical stress, illustrating large prosodic variability between talkers. Moreover, classes of cue-weighting strategies emerged, with a large group of primarily F0-weighting talkers and another group of primarily intensity-weighting talkers. Furthermore, based on LDA accuracy scores, we confirmed that spectral tilt was a more reliable cue to lexical stress than intensity for /a:/. However, when 9 other vowels were included in the analysis, this advantage disappeared, suggesting that across a larger sample of Dutch vowels both cues are equally important. Together, these outcomes contribute to a more comprehensive acoustic description of lexical stress in Dutch, allowing both group-level and individual-talker inferences.}, author = {Severijnen, Giulio G.A. and Hans Rutger Bosker and James M. McQueen} } @article {472, title = {Automatic assessment of transcript accuracy for speech intelligibility studies}, year = {2020}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {online}, abstract = {In the field of speech perception, many studies assess the intelligibility of spoken stimuli by means of verbal repetition ({\textquoteleft}repeat back what you hear{\textquoteright}) or transcription tasks ({\textquoteleft}type out what you hear{\textquoteright}). The intelligibility of a given stimulus is then often expressed as the percentage of words correctly reported from the target stimulus. Yet scoring participants{\textquoteright} raw transcripts for words correctly identified from the target stimulus is time-consuming, and hence resource-intensive. Moreover, there is no consensus on what protocol to use for human scoring, limiting the reliability of human scores. The present paper evaluates various forms of {\textquoteleft}fuzzy string matching{\textquoteright} between participants{\textquoteright} responses and target sentences as automated metrics of listener transcript accuracy. Fuzzy string matching is identified as a consistent, efficient, and accurate method for automated assessment of listener transcripts, as evidenced by high correlations with human-generated scores (highest r = 0.94) and a strong relationship to acoustic markers of speech intelligibility. Thus, fuzzy string matching provides a practical tool for speech scientists, allowing fast and reliable assessment of listener transcript accuracy in large-scale speech intelligibility studies.}, author = {Hans Rutger Bosker} } @article {480, title = {Listeners learn and predict talker-specific prosodic cues in speech perception}, year = {2020}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {online}, abstract = {One of the challenges in speech perception is that listeners must deal with considerable segmental and suprasegmental variability in the acoustic signal due to differences between talkers.
Most previous studies have focused on how listeners deal with segmental variability. In this EEG experiment, we investigated how listeners track talker-specific usage of suprasegmental cues to lexical stress in order to correctly recognize spoken words. In a 3-day training phase, Dutch participants learned to map non-word minimal stress pairs onto different object referents (e.g., USklot means {\textquotedblleft}lamp{\textquotedblright}; usKLOT means {\textquotedblleft}train{\textquotedblright}). These non-words were produced by two male talkers. Critically, each talker used only one suprasegmental cue to signal lexical stress (e.g., Talker A only used F0, Talker B only amplitude). We expected participants to learn which talker used which cue to signal stress. In the test phase, participants indicated whether spoken sentences including these non-words were correct ({\textquotedblleft}The word for {\textquoteleft}lamp{\textquoteright} is...{\textquotedblright}). We recorded participants{\textquoteright} response times and EEG patterns, targeting an ERP related to phonological prediction: the N200. We found that participants were slower to indicate that a stimulus was correct if the non-word was produced with the unexpected cue (e.g., Talker A using amplitude). That is, if in training Talker A had used F0 to signal stress, participants experienced a mismatch between predicted and perceived phonological word-forms if, at test, Talker A unexpectedly used amplitude as a cue to stress. This illustrates talker-specific prediction of suprasegmental cues, picked up through perceptual learning during training. In contrast, the N200 amplitude was not modulated by the mismatch. Theoretical implications of these results are discussed.}, author = {Severijnen, Giulio G.A. and Hans Rutger Bosker and Piai, Vitoria and James M. McQueen} } @article {21, title = {Foreign languages sound fast: evidence for the {\textquoteleft}Gabbling Foreigner Illusion{\textquoteright}}, year = {2017}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {Amsterdam, The Netherlands}, abstract = {

Anecdotal evidence suggests that unfamiliar languages sound faster than one{\textquoteright}s native language. Empirical evidence for this impression has come from explicit tempo judgments. However, it is unknown whether such perceived rate differences between native and foreign languages (FLs) have effects on implicit speech processing.

Our measure of implicit perception was {\textquoteleft}rate normalization{\textquoteright}: Dutch and German listeners interpret vowels midway between /ɑ/ and /a:/ more often as /a:/ if the target vowel follows a fast (vs. slow) sentence. We asked whether such a {\textquoteleft}rate normalization{\textquoteright} effect might also be observed when the context is not actually faster but simply spoken in a foreign language.

Dutch and German participants listened to Dutch and German (rate-matched) fast and slow sentences, followed by non-words containing vowels from an /ɑ-a:/ duration continuum. Participants indicated which vowel they heard (fap vs. faap). Across three experiments, we consistently found that German listeners reported more /a:/ responses after foreign sentences (vs. native), suggesting that foreign sentences were indeed perceived as faster. However, mixed results were found for the Dutch groups. We conclude that the subjective impression that FLs sound fast may affect implicit speech processing, influencing how language learners perceive spoken segments in an FL.

}, author = {Hans Rutger Bosker} } @article {35, title = {How speech rate shapes perception}, year = {2015}, publisher = {Nederlandse Vereniging voor Fonetische Wetenschappen}, address = {Utrecht, The Netherlands}, abstract = {

Speech can be delivered at different rates and, as a consequence, listeners have to normalize the incoming speech signal for the rate at which it was produced. This perceptual process, known as rate normalization, is contrastive in nature: for instance, the perception of an ambiguous Dutch vowel midway between short /ɑ/ and long /a:/ is biased towards hearing long /a:/ when it is preceded by a fast sentence context.

Rate normalization has primarily been explained in terms of durational contrast: the ambiguous vowel is perceived as longer because its duration is relatively long compared to the preceding shorter vowels in the fast context. In this talk, novel experimental data will be presented that challenge the durational contrast account by (1) demonstrating that it is the contextual rate, not duration, that elicits rate normalization; and (2) suggesting that vowel categorization is sensitive to the phase of the contextual rhythm.

To explain these new findings, a neurobiologically plausible account of rate normalization is proposed, involving neural entrainment of endogenous brain oscillations to the speech rate of the spoken signal.

}, author = {Hans Rutger Bosker} }
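
The {\textpm}4-semitone manipulation described in entry 513 above can be made concrete. Shifting a signal by s semitones scales F0 by a factor of 2^(s/12), so +4 semitones corresponds to roughly 1.26x and -4 semitones to roughly 0.79x. The Python sketch below applies such shifts with librosa; the abstract does not name the resynthesis tool that was actually used, so the library choice and the file name are illustrative assumptions.

# Hypothetical re-creation of the +/-4 semitone conditions of entry 513.
# The abstract does not specify the resynthesis method; librosa is used
# here purely for illustration, and 'carrier.wav' is a placeholder name.
import librosa
import soundfile as sf

SHIFT = 4  # semitones; F0 scales by 2 ** (SHIFT / 12), i.e. ~1.26

y, sr = librosa.load("carrier.wav", sr=None)                   # original (Mid-F0) carrier
high = librosa.effects.pitch_shift(y, sr=sr, n_steps=SHIFT)    # High-F0 condition
low = librosa.effects.pitch_shift(y, sr=sr, n_steps=-SHIFT)    # Low-F0 condition

sf.write("carrier_high.wav", high, sr)
sf.write("carrier_low.wav", low, sr)

Note that this phase-vocoder-plus-resampling shift moves formants along with F0; a PSOLA-style manipulation (e.g., in Praat) would change F0 while leaving the spectral envelope largely intact, which matters when the hypothesis concerns F0 specifically.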
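
The per-talker cue-weighting analysis in entry 487 above follows a pattern that is easy to sketch: fit one Linear Discriminant Analysis per talker, predicting stress (SW vs. WS) from the acoustic cues, and read the standardized coefficients as cue weights. The snippet below is a minimal illustration under assumed column names and an assumed CSV layout; the original study's preprocessing, and its treatment of vowel quality (omitted here), are not specified in the abstract.

# Sketch of a per-talker LDA cue-weighting analysis (cf. entry 487).
# File name and column names are assumptions; vowel quality is left out
# for simplicity because it is itself multidimensional (e.g., F1/F2).
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler

CUES = ["mean_f0", "f0_variation", "duration", "spectral_tilt", "intensity"]

df = pd.read_csv("stress_tokens.csv")  # one row per token: talker, cues, stress label

for talker, tokens in df.groupby("talker"):
    X = StandardScaler().fit_transform(tokens[CUES])     # put cues on a common scale
    lda = LinearDiscriminantAnalysis().fit(X, tokens["stress"])
    weights = dict(zip(CUES, lda.coef_[0]))              # larger |weight| = heavier reliance
    accuracy = lda.score(X, tokens["stress"])            # cf. the LDA accuracy scores above
    print(talker, weights, round(accuracy, 2))

Grouping talkers by their largest-magnitude weight would then recover the F0-weighting versus intensity-weighting classes that the abstract reports.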
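
Entry 472 above identifies fuzzy string matching as a fast proxy for human transcript scoring. As a minimal sketch of the general idea, the snippet below scores a listener's response against a target sentence using only Python's standard-library difflib; the paper compares several matching metrics, and this particular function and normalization are assumptions rather than the published protocol.

# Minimal fuzzy-string-matching score for a listener transcript (cf. entry 472).
# Uses only the standard library; the actual study evaluates several metrics.
from difflib import SequenceMatcher

def transcript_accuracy(target: str, response: str) -> float:
    """Return a 0-1 similarity between a target sentence and a response."""
    def normalize(s: str) -> str:
        return " ".join(s.lower().split())  # ignore case and extra whitespace
    return SequenceMatcher(None, normalize(target), normalize(response)).ratio()

# Example: the response drops one word from the five-word target (score ~0.83).
print(transcript_accuracy("nu komt het woord sok", "nu komt het sok"))

Scores like these can then be correlated with human {\textquoteleft}proportion words correct{\textquoteright} judgments, as in the r = 0.94 correlation reported above.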