login

AI 1—Intelligent Speech Technology

ICT

STEAM

Week 1

In PyCharm, a Python script demonstrates how to run simple online speech recognition and generation using recognize_google and edge_tts. Syntax: import, print, function, parameters.

Introduce the Unihiker K10, demonstrate offline speech recognition and synthesis (Mind+ programming), and control the lights.

P.S.: Remind students to register a Xiaozhi account.

 

Week 2

Offline speech recognition and generation (FFMPEG and model pre-installed), using whisper and pyttsx3

Syntax: data type, input, comment

Laser cutting basics instruction, shape design (related to community elderly care scenarios), and drawing (preparing basic shapes).

Week 3

Voice chat, different text outputs correspond to different voice messages.

Syntax: if, elif, else

Installation and testing: install the Xiaozhi firmware (guidebook, students can flash it themselves), and change the smart agent settings in the Xiaozhi backend.

Week 4

Presentation PPT, Poster

Presentation, Peer assessment

ICT代码

# -----------------------------------

# Speech synthesis (online)
# pip install edge_tts
import edge_tts
import os

# What to say, which neural voice says it, and where the MP3 is written.
text = "不用第三方库也能播放语音"
voice = "zh-CN-XiaoxiaoNeural"
file = "edge_tts.mp3"

# Build the synthesis request, then generate the audio and save it to `file`.
comm = edge_tts.Communicate(text=text, voice=voice)
comm.save_sync(file)

# Open the MP3 with the default associated player.
# NOTE: os.startfile is only available on Windows.
os.startfile(file)

# -----------------------------------

# 语音合成(离线)
# pip install pyttsx3 pywin32
# import pyttsx3

# e = pyttsx3.init()
# txt = "离线语音合成测试"
# e.say(txt)
# e.save_to_file(txt,"pyttsx3.wav")
# e.runAndWait()

# --------------------------------------

# 语音识别(在线)
# pip install SpeechRecognition pyaudio
# import speech_recognition as sr

# r = sr.Recognizer()
# mic = sr.Microphone()
# print("开始录音,停止1秒后识别结果")
# mic.__enter__()
# audio = r.listen(mic)
# text = r.recognize_google(audio)
# print("Result ", text)

# --------------------------------------

# 语音识别(离线)
# pip install SpeechRecognition[whisper-local]
# 下载ffmpeg, https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-full-shared.7z
# 解压后把 bin/ffmpeg.exe 放到 C:\Windows\System32 里就行
# import speech_recognition as sr

# r = sr.Recognizer()
# mic = sr.Microphone()
# print("开始录音,停止1秒后识别结果")
# mic.__enter__()
# audio = r.listen(mic)
# text = r.recognize_whisper(audio)
# print("Result ", text)

STEAM代码

import asr
import time
# asr.ASR_MODE_SINGLE: single-shot recognition
# asr.ASR_MODE_CONTINUOUS: continuous recognition
# asr.init_asr(6000, asr.ASR_MODE_SINGLE)  # configure the detection time and the recognition mode
# Initialize speech recognition with the module's default settings.
asr.init_asr()

def cb(data):
    """Recognition callback: speak the light state and print the command ID.

    ``data`` is the ID handed to the callback by the asr module. ID 1 is
    announced as "lights on"; every other ID is announced as "lights off".
    """
    message = "灯光已打开" if data == 1 else "灯光已关闭"
    asr.add_tts_data(message)
    print("ID:", data)
    
# Register the callback that handles recognition results
asr.set_asr_callback(cb)
# Start speech recognition
asr.start_asr()
# Start speech synthesis
asr.start_tts()
# Register the command phrases as (command ID, pinyin phrase) pairs
for command_id, phrase in ((1, "kai deng"), (2, "guan deng")):
    asr.add_asr_command(command_id, phrase)
print("请说‘你好,小鑫’唤醒")
try:
    # Keep the script alive; recognized commands are delivered to cb.
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\n捕获到 Ctrl+C 中断!")
    # Release the speech recognition resources
    asr.free_asr()

本地服务器

K10

# MindPlus
# DFRobot, 行空板 K10
from k10_base import WiFi
from unihiker_k10 import speaker,mic
import urequests  # 必须加这个
# Main program starts here
wifi = WiFi()
# NOTE(review): the hotspot name and password are hard-coded; substitute your
# own network's values and avoid publishing real credentials.
wifi.connect(ssid="Oldmoon Pura 80 Pro",psd="98765431",timeout=50000)
# Busy-wait until the board reports a connection.
while not (wifi.status()):
    pass
# NOTE(review): eval() executes whatever text wifi.info() returns; this is
# tolerable only if that string always comes from the board library - confirm.
print(eval(wifi.info())[0])
#print(dir(speaker))

# Speech synthesis: download the audio produced by the server's /tts route.
res = urequests.get("http://192.168.43.8:8000/tts")
# Close the file before playback so every byte is flushed to storage first.
with open("/mindplus/tts_out.wav","wb") as f:
    f.write(res.content)
speaker.play_sys_music("tts_out.wav")

# Speech recognition: record a clip (time=5, presumably seconds), then upload it.
mic.recode_sys(name="sound.wav",time=5)
# Read the whole recording and release the file handle right away.
with open("/mindplus/sound.wav", "rb") as f:
    data = f.read()
res = urequests.post(
    "http://192.168.43.8:8000/asr",
    data=data  # send the raw binary as the request body
)
print("识别结果:", res.text)

Fastapi

import edge_tts,io
import pyttsx3
import speech_recognition as sr

# The K10 can only record and play WAV, so edge_tts (MP3 by default) does not work here
@app.get("/tts")
async def tts(txt: str = "不用第三方库也能播放语音"):
    """Synthesize ``txt`` with pyttsx3 and return the result as a WAV file.

    The audio is written to ``pyttsx3.wav`` in the working directory and that
    file is sent back with the ``audio/wav`` media type.
    """
    wav_path = "pyttsx3.wav"
    engine = pyttsx3.init()
    engine.save_to_file(txt, wav_path)
    engine.runAndWait()
    return FileResponse(wav_path, media_type="audio/wav")
    # Unused edge_tts alternative, kept for reference (it responds with MP3):
    # buf = io.BytesIO()
    # async for d in edge_tts.Communicate(txt,"zh-HK-HiuGaaiNeural").stream():
    #     if d["type"]=="audio":buf.write(d["data"])
    # buf.seek(0)
    # return StreamingResponse(buf,media_type="audio/mpeg")

# The K10 can only upload raw binary; it cannot do a traditional file upload
@app.post('/asr')
async def asr(request: Request):
    """Transcribe the audio bytes in the request body with recognize_google.

    Returns the recognized text. If any step raises, returns a message that
    starts with "识别失败:" followed by the exception type and its text.
    """
    recognizer = sr.Recognizer()
    try:
        # The request body is the audio binary sent by the K10
        wav_stream = io.BytesIO(await request.body())

        with sr.AudioFile(wav_stream) as source:
            recording = recognizer.record(source)

        # The language parameter must be set for Chinese
        return recognizer.recognize_google(recording, language="zh-CN")
    except Exception as e:
        return f"识别失败:{type(e).__name__} {str(e)}"

login