whisperを使用した文字起こし¶
whisper¶
OpenAIが開発した音声認識ソフトウェア
準備¶
GPU を使う ( CUDA のバージョンは nvidia-smi で確認 ). Ubuntu 上の PyTorch 環境を想定.
python -m pip install openai-whisper
$ sudo apt install ffmpeg
$ python -m pip install openai-whisper
コード¶
import os, sys, math, subprocess, time, tqdm
import whisper
import pydub
# ========================================================= #
# === whisper__fromSound.py === #
# ========================================================= #
def whisper__fromSound( inpFile=None, outFile=None, model="small", bitrate="64k", work_dir="work_dir", logFile="whisper.log" ):
    """Transcribe an audio file with Whisper, processing it in 10-minute segments.

    Input that does not end in ``.mp3`` is first converted to MP3 with ffmpeg.
    The MP3 is then cut into segments of at most 600 seconds, each segment is
    transcribed separately, and the texts are joined (one segment per line)
    into ``outFile``.

    Args:
        inpFile: Path (str) of the audio file to transcribe. Required.
        outFile: Path of the resulting text file. When None, the base name of
            ``inpFile`` with a ``.txt`` extension is used (relative path, so it
            lands in the current working directory).
        model: Model name handed to ``whisper.load_model``.
        bitrate: Audio bitrate used for the ffmpeg conversion and for the
            exported segments.
        work_dir: Directory receiving the converted MP3, the segment MP3s and
            the per-segment transcripts. Created if missing.
        logFile: File that captures ffmpeg's stdout/stderr. Only written when
            a conversion is actually performed.

    Returns:
        The full transcription: each segment's text followed by a newline.

    Raises:
        SystemExit: if ``inpFile`` is None, if ffmpeg cannot be started, or if
            ffmpeg exits with a non-zero status.
    """
    # ------------------------------------------------- #
    # --- [1] arguments                             --- #
    # ------------------------------------------------- #
    # Validate before touching the file system so that an invalid call
    # leaves no stray work directory behind.
    if inpFile is None:
        sys.exit( "[whisper__fromSound.py] inpFile == ???" )

    print( "\n -- Begining of Transcription -- \n" )
    start_time = time.time()

    base_name = os.path.splitext( os.path.basename( inpFile ) )[0]
    if outFile is None:
        outFile = base_name + ".txt"
    os.makedirs( work_dir, exist_ok=True )

    # ------------------------------------------------- #
    # --- [2] preparation                           --- #
    # ------------------------------------------------- #
    # -- [2-1] conversion into .mp3 -- #
    if inpFile.lower().endswith( ".mp3" ):
        mp3File = inpFile
    else:
        mp3File = os.path.join( work_dir, base_name + ".mp3" )
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact and prevents them from being interpreted.
        cmd = [ "ffmpeg", "-y", "-i", inpFile, "-b:a", str( bitrate ), mp3File ]
        try:
            with open( logFile, "w" ) as lf:
                proc = subprocess.run( cmd, stdout=lf, stderr=lf )
        except FileNotFoundError:
            sys.exit( "[whisper__fromSound.py] ffmpeg executable not found." )
        # Stop here on failure; otherwise the error would only show up later
        # as a confusing decode error (or a stale file would be transcribed).
        if proc.returncode != 0:
            sys.exit( "[whisper__fromSound.py] ffmpeg failed (exit code {}). See {}"
                      .format( proc.returncode, logFile ) )

    # -- [2-2] division of .mp3 -- #
    audio = pydub.AudioSegment.from_file( mp3File )
    total_ms = len( audio )                  # length is handled in milliseconds
    segment_length_sec = 600                 # 10 minutes per segment
    segment_length_ms = segment_length_sec * 1000
    num_segments = math.ceil( total_ms / segment_length_ms )

    segment_files = []
    for ik in range( num_segments ):
        start_ms = ik * segment_length_ms
        end_ms = min( start_ms + segment_length_ms, total_ms )
        segment_path = os.path.join( work_dir, f"segment_{ik+1:02d}.mp3" )
        audio[start_ms:end_ms].export( segment_path, format="mp3", bitrate=bitrate )
        segment_files.append( segment_path )

    # ------------------------------------------------- #
    # --- [3] transcription                         --- #
    # ------------------------------------------------- #
    # -- [3-1] load model -- #
    wmodel = whisper.load_model( model )

    # -- [3-2] transcribe each segment -- #
    texts = []
    for ik, segment_path in enumerate( tqdm.tqdm( segment_files, desc="Transcribing" ) ):
        # fp16=False is passed through unchanged from the original script
        # (presumably to force FP32 decoding -- confirm against whisper docs).
        result = wmodel.transcribe( segment_path, fp16=False )
        text = result['text']
        texts.append( text )
        # -- save intermediate file so partial results survive a later failure -- #
        txt_path = os.path.join( work_dir, f"segment_{ik+1:02d}.txt" )
        with open( txt_path, "w", encoding="utf-8" ) as f:
            f.write( text )

    # -- [3-3] concatenate all -- #
    full_transcription = "".join( text + "\n" for text in texts )
    with open( outFile, "w", encoding="utf-8" ) as f:
        f.write( full_transcription )

    elapsed = time.time() - start_time
    print( "\n - ellapsed time :: {} (s)\n".format( elapsed ) )
    print( " -- End of Transcription -- \n" )
    return( full_transcription )
# ========================================================= #
# === Execution of Program === #
# ========================================================= #
if ( __name__=="__main__" ):
    # Command-line entry point: parse the options, run the transcription,
    # and print the resulting text.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument( "--inpFile" , help=" input file name.", default=None )
    parser.add_argument( "--outFile" , help="output file name.", default=None )
    parser.add_argument( "--bitrate" , help="bit rate", default="64k" )
    parser.add_argument( "--work_dir", help="work directory", default="work_dir" )
    # NOTE: the command-line default ( ffmpeg.log ) differs from the default
    # of whisper__fromSound's logFile parameter ( whisper.log ).
    parser.add_argument( "--logFile" , help="log file for ffmpeg", default="ffmpeg.log" )
    parser.add_argument( "--model"   , help="model type [tiny, base, small, medium, large]",
                         default="small" )
    args = parser.parse_args()

    # A missing --inpFile is reported by whisper__fromSound itself.
    ret = whisper__fromSound( inpFile=args.inpFile, outFile=args.outFile, model=args.model,
                              bitrate=args.bitrate, work_dir=args.work_dir, logFile=args.logFile )
    print( " [transcription] {}".format( ret ) )
使用例¶
python whisper__fromSound.py --inpFile input.wav --model small
オプション引数¶
引数 |
値 |
---|---|
inpFile |
入力ファイル ( e.g.: .mp3 ) |
outFile |
出力ファイル : default = inpFile のベース名の拡張子を .txt に置き換えたもの ( カレントディレクトリに出力 ) |
model |
モデルサイズ: tiny, base, small, medium, large |
bitrate |
ビットレート ( default. 64k ) |
work_dir |
一時ディレクトリの場所 ( default. work_dir ) |
logFile |
ffmpeg のログファイル名 ( default. コマンドライン実行時は ffmpeg.log, 関数として呼び出す場合は whisper.log ) |