[TOC]
wave
wave 模块提供了一个处理 WAV 声音格式的便利接口。它不支持压缩/解压,但是支持单声道/立体声。
install
wave 是 Python 标准库自带的模块,无需安装(不需要 pip install)。
example
# Read audio file data in chunks.
def read_wav(audio_name, chunk=3200):
    """Read a WAV file and return its raw frame data as a list of byte chunks.

    Args:
        audio_name: path to the WAV file to read.
        chunk: number of frames to read per iteration (default 3200,
            matching the original hard-coded value).

    Returns:
        list of bytes objects, each holding up to ``chunk`` frames.
    """
    with wave.open(audio_name, 'rb') as rf:
        frames = []
        # readframes returns b'' at end of file, which ends the loop.
        frame = rf.readframes(chunk)
        while frame:
            frames.append(frame)
            frame = rf.readframes(chunk)
        return frames
# Save audio frame data to a WAV file.
def write_wav(save_file, frames, CHANNELS=1, SIMPLE_SIZE=2, RATE=16000):
    """Write raw PCM frames to *save_file* as a WAV file.

    Args:
        save_file: output path; if None, nothing is written.
        frames: iterable of bytes objects containing raw PCM frame data.
        CHANNELS: number of audio channels (default 1, mono).
        SIMPLE_SIZE: sample width in bytes (default 2, i.e. 16-bit samples).
        RATE: sample rate in Hz (default 16000).
    """
    if save_file is not None:
        # 'with' guarantees the file is closed even if a set*/write call
        # raises; the original leaked the handle on error.
        with wave.open(save_file, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(SIMPLE_SIZE)
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
wave 可以读取和保存音频文件,但不能做时频处理、特征提取等工作。注意:如果读取 rate=16000 的文件却按 rate=8000 保存,样本数据不变而采样率减半,播放速度变为原来的一半,音频时长相应增加一倍。
参考:https://docs.python.org/zh-cn/3/library/wave.html#module-wave
Librosa
Librosa 是一个用于音频、音乐分析、处理的python工具包,一些常见的时频处理、特征提取、绘制声音图形等功能应有尽有,功能十分强大。
install
pip install librosa
# conda install
conda install -c conda-forge librosa
example
# Resample an audio file and save it at the new sample rate.
def change_sample_rate(read_file, save_file, orig_sr=48000, target_sr=8000):
    """Load *read_file* at *orig_sr*, resample to *target_sr*, save to *save_file*.

    Args:
        read_file: path of the input audio file.
        save_file: path of the resampled output WAV file.
        orig_sr: sample rate to load the input at (default 48000 Hz).
        target_sr: sample rate of the output (default 8000 Hz).
    """
    y, sr = librosa.load(read_file, sr=orig_sr)
    # librosa >= 0.10 requires orig_sr/target_sr as keyword arguments;
    # the same keywords are accepted by older versions, so this stays
    # backward compatible while fixing the positional-call TypeError.
    y_resampled = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    # NOTE(review): librosa.output was removed in librosa 0.8 — on modern
    # versions replace this line with
    # soundfile.write(save_file, y_resampled, target_sr).
    librosa.output.write_wav(save_file, y_resampled, target_sr)
参考: http://librosa.github.io/librosa/tutorial.html
pyaudio
pyaudio 是一个可以读取麦克风输入、读取音频文件并播放音频的 Python 模块。
install
pip install pyaudio
example
wave读取音频文件,pyaudio实现播放音频
"""PyAudio Example: Play a wave file."""
import pyaudio
import wave
import sys
CHUNK = 1024
if len(sys.argv) < 2:
print("Plays a wave file.\n\nUsage: %s filename.wav" % sys.argv[0])
sys.exit(-1)
wf = wave.open(sys.argv[1], 'rb')
# instantiate PyAudio (1)
p = pyaudio.PyAudio()
# open stream (2)
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
# read data
data = wf.readframes(CHUNK)
# play stream (3)
while len(data) > 0:
stream.write(data)
data = wf.readframes(CHUNK)
# stop stream (4)
stream.stop_stream()
stream.close()
# close PyAudio (5)
p.terminate()
读取麦克风并通过阿里语音识别API实时识别。
class MyCallback(SpeechRecognizerCallback):
    """Recognition callback that records intermediate and final ASR results."""

    def __init__(self, name='default'):
        self._name = name       # label attached to the completed result
        self.completed = None   # final result dict, filled in by on_completed
        self.result = None      # latest intermediate recognition text

    def on_started(self, message):
        print('MyCallback.OnRecognitionStarted: %s' % message)

    def on_result_changed(self, message):
        # Cache and echo each intermediate result as it arrives.
        self.result = message['payload']['result']
        print(self.result)

    def on_completed(self, message):
        # Flatten the final message into a single summary dict.
        header = message['header']
        self.completed = {
            'status': header['status'],
            'file': self._name,
            'task_id': header['task_id'],
            'result': message['payload']['result'],
        }

    def on_task_failed(self, message):
        print('MyCallback.OnRecognitionTaskFailed: %s' % message)

    def on_channel_closed(self):
        print('MyCallback.OnRecognitionChannelClosed')
class Ali_Speech():
    """Stream microphone audio to the Alibaba Cloud speech-recognition API.

    Usage: construct, call start() to open the microphone stream,
    ali_api() to run recognition, then stop() to release audio resources.
    """

    def __init__(self, access_key_id='access_key_id',
                 access_key_secret='access_key_secret', appkey='set_appkey'):
        """Create the NLS client and fetch an access token.

        Credentials are now parameters (defaults keep the original
        placeholder strings) so real keys are not hard-coded in source.

        Args:
            access_key_id: Alibaba Cloud access key id.
            access_key_secret: Alibaba Cloud access key secret.
            appkey: project appkey passed to the recognizer.
        """
        self.token, _ = ali_speech.NlsClient.create_token(access_key_id, access_key_secret)
        self.appkey = appkey
        self.client = ali_speech.NlsClient()
        self.client.set_log_level('INFO')
        self.callback = MyCallback()
        # Recording parameters; FORMAT value 8 matches pyaudio.paInt16.
        self.CHUNK = 8092  # NOTE(review): possibly intended as 8192 — confirm
        self.FORMAT = 8
        self.CHANNELS = 1
        self.RATE = 16000

    def start(self):
        """Open the microphone input stream."""
        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(format=self.FORMAT, channels=self.CHANNELS,
                                  rate=self.RATE, input=True,
                                  frames_per_buffer=self.CHUNK)

    def stop(self):
        """Stop and close the audio stream and terminate PyAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

    def ali_api(self, record_seconds=60, wave_save_path=None):
        """Record from the microphone and recognize speech in real time.

        Args:
            record_seconds: maximum recording duration in seconds.
            wave_save_path: unused here; kept for interface compatibility.

        Returns:
            The callback's completed-result dict on success, a negative
            error code if the recognizer failed to start, or None if an
            exception occurred during streaming.
        """
        self.recognizer = self.client.create_recognizer(self.callback)
        self.recognizer.set_appkey(self.appkey)
        self.recognizer.set_token(self.token)
        self.recognizer.set_format(ASRFormat.PCM)
        self.recognizer.set_sample_rate(ASRSampleRate.SAMPLE_RATE_16K)
        self.recognizer.set_enable_intermediate_result(True)
        self.recognizer.set_enable_punctuation_prediction(True)
        self.recognizer.set_enable_inverse_text_normalization(True)
        try:
            ret = self.recognizer.start()
            if ret < 0:
                return ret
            # Send audio in CHUNK-frame slices for the requested duration.
            for _ in range(int(self.RATE / self.CHUNK * record_seconds)):
                data = self.stream.read(self.CHUNK)
                ret = self.recognizer.send(data)
                if ret < 0:
                    break
            self.recognizer.stop()
            return self.callback.completed
        except Exception as e:
            # Best-effort: report the error and fall through (returns None).
            print(str(e))
        finally:
            # Always release the recognizer channel, even on failure.
            self.recognizer.close()