iFlytek 音声合成 API を介して音声と字幕 (SRT) ファイルを同期的に生成

iFlytek 音声 API

iFlytek の音声合成 API は合成音声を出力するだけで、対応する字幕 (srt) ファイルを直接生成することはできません。ただし、API は音声データを返すたびに、その時点までに合成済みのテキストのバイト数 (data.ced) も一緒に返します。

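data.ced	string	合成进度，指当前合成文本的字节数
注：请注意合成是以句为单位切割的，若文本只有一句话，则每次返回结果的ced是相同的。

(訳) data.ced: 合成の進捗。現在までに合成されたテキストのバイト数を指します。
(訳) 注: 合成は文単位で分割されるため、テキストが 1 文だけの場合、毎回返される ced は同じ値になります。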

このフィールドを通じて、必要な srt 字幕ファイルを生成できます。

ハマった落とし穴

API からは PCM 形式で音声を返してもらう必要があります。MP3 を選択すると、mp3 ファイルの長さの計算で誤差が生じます。本来であれば、各 mp3 セグメントの長さを合計すれば、最終的に生成されるファイル全体の長さと一致するはずです。しかし実際にはまったく一致せず、生成される srt ファイルのタイムラインが不正確になってしまいます。以下の Python ライブラリで mp3 の長さを計算してみましたが、いずれも正しい値は得られませんでした。


    # Using mutagen.mp3
    from mutagen.mp3 import MP3
    audio = MP3(filename)
    duration = audio.info.length

    # Using ffmpeg
    duration = ffmpeg.probe(filename)['format']['duration']

    # Using eyed3
    duration = eyed3.load(filename).info.time_secs

おそらく、API から返される mp3 のフォーマット自体に何か問題があるのだと思われます。

### Python コード


# -*- coding:utf-8 -*-
import websocket
import datetime
import hashlib
import base64
import hmac
import json
from urllib.parse import urlencode
import time
import ssl
from wsgiref.handlers import format_date_time
from datetime import datetime
from time import mktime
import _thread as thread
import os
import shutil
import ffmpeg
import sys

# Shared Ws_Param instance: assigned by init(), read by create_pcm() and on_open().
wsParam = None

class Const:
    """Namespace for constants shared by the functions in this script."""

    # Folder that receives the per-segment PCM files and the merged all.pcm.
    PCM_FOLDER = './pcm'

class Ws_Param(object):
    """Request payload and signed WebSocket URL for one TTS synthesis call.

    Attributes:
        APPID, APIKey, APISecret: credentials supplied by the caller.
        Text: the text to synthesize.
        CommonArgs: the "common" section of the request.
        BusinessArgs: the "business" section of the request (audio format,
            voice, volume, pitch, text encoding).
        Data: the "data" section of the request, carrying the
            base64-encoded UTF-8 text.
    """

    def __init__(self, APPID, APIKey, APISecret, Text):
        """Store the credentials and pre-build the three request sections."""
        self.APPID = APPID
        self.APIKey = APIKey
        self.APISecret = APISecret
        self.Text = Text

        # Common parameters.
        self.CommonArgs = {"app_id": self.APPID}

        # Business parameters; more options are listed on the vendor site.
        # "aue": "raw" with a 16 kHz L16 format matches the arithmetic in
        # pcm_duration(), which assumes 16000 samples/s at 2 bytes each.
        self.BusinessArgs = {
            "aue": "raw",
            "auf": "audio/L16;rate=16000",
            "vcn": "aisjiuxu",
            "volume": 80,
            "pitch": 25,
            "tte": "utf8",
        }

        # The text travels base64-encoded; status 2 is sent with the single
        # frame that carries the whole text.
        encoded_text = base64.b64encode(self.Text.encode('utf-8'))
        self.Data = {"status": 2, "text": encoded_text.decode("UTF8")}
        # Minority languages must use the form below instead; "unicode" here
        # means the UTF-16 little-endian encoding, i.e. "UTF-16LE".
        # self.Data = {"status": 2, "text": str(base64.b64encode(self.Text.encode('utf-16')), "UTF8")}

    def create_url(self):
        """Return the TTS WebSocket URL with the signed auth query string.

        The signature is an HMAC-SHA256 (keyed with APISecret) over the
        host, an RFC 1123 date and the request line; it is embedded in a
        base64-encoded authorization string together with APIKey.
        """
        endpoint = 'wss://tts-api.xfyun.cn/v2/tts'
        signing_host = "ws-api.xfyun.cn"

        # RFC 1123 timestamp derived from the current local time.
        date = format_date_time(mktime(datetime.now().timetuple()))

        # The three lines that get signed, joined by newlines.
        signature_origin = "\n".join((
            "host: " + signing_host,
            "date: " + date,
            "GET " + "/v2/tts " + "HTTP/1.1",
        ))

        # HMAC-SHA256 over the signing string, then base64.
        digest = hmac.new(
            self.APISecret.encode('utf-8'),
            signature_origin.encode('utf-8'),
            digestmod=hashlib.sha256,
        ).digest()
        signature = base64.b64encode(digest).decode(encoding='utf-8')

        authorization_origin = "api_key=\"%s\", algorithm=\"%s\", headers=\"%s\", signature=\"%s\"" % (
            self.APIKey, "hmac-sha256", "host date request-line", signature)
        authorization = base64.b64encode(
            authorization_origin.encode('utf-8')).decode(encoding='utf-8')

        # Auth parameters as a dict, appended to the endpoint as a query string.
        query = {
            "authorization": authorization,
            "date": date,
            "host": signing_host,
        }
        # When debugging against another implementation, print `date`,
        # `query` and the final URL here and compare them.
        return endpoint + '?' + urlencode(query)

def on_message(ws, message):
    """Handle one response frame: report an error or persist the audio chunk.

    On success the decoded audio is appended to two files under
    Const.PCM_FOLDER: ``v_<ced zero-padded to 8>.pcm`` (one file per
    distinct ``ced`` value) and ``all.pcm`` (the whole stream). When the
    frame's status is 2 the socket is closed after the chunk is written.

    Args:
        ws: the WebSocket application object; only ``close()`` is called.
        message: the raw JSON text of the frame.

    Any exception raised while handling the frame is caught and printed,
    so this callback never propagates an error to its caller.
    """
    try:
        message = json.loads(message)
        code = message["code"]
        sid = message.get("sid")
        print(message)

        # Check the result code before touching "data": reading
        # message["data"] first would raise on an error frame that has no
        # audio payload and hide the service's own error message behind a
        # generic "parse exception".
        if code != 0:
            errMsg = message.get("message")
            print("sid:%s call error:%s code is:%s" % (sid, errMsg, code))
            # Nothing usable will follow a failed request; end the session.
            ws.close()
            return

        data = message["data"]
        audio = base64.b64decode(data["audio"])
        status = data["status"]
        # str() guards against the offset arriving as a JSON number.
        ced = str(data["ced"])

        # Append mode: several frames can report the same ced, and they
        # all belong to the same segment file.
        with open(Const.PCM_FOLDER + '/v_' + ced.zfill(8) + '.pcm', 'ab') as f:
            f.write(audio)

        with open(Const.PCM_FOLDER + '/all.pcm', 'ab') as fa:
            fa.write(audio)

        # Close only after the final chunk has been written to disk.
        if status == 2:
            print("ws is closed")
            ws.close()

    except Exception as e:
        print("receive msg,but parse exception:", e)

def on_error(ws, error):
    """Print the error reported for the WebSocket session.

    Args:
        ws: the WebSocket application object (unused).
        error: the error value; it is printed using its string form.
    """
    report = ("### error:", error)
    print(*report)

def on_close(ws, close_status_code=None, close_msg=None):
    """Print a marker when the WebSocket session is closed.

    Args:
        ws: the WebSocket application object (unused).
        close_status_code: optional close status passed by the library
            (unused).
        close_msg: optional close reason passed by the library (unused).

    The two extra parameters are optional so the callback works both when
    it is called with ``ws`` only and when it is called with the
    three-argument form ``(ws, close_status_code, close_msg)`` used by
    websocket-client 1.0 and later.
    """
    print("### closed ###")

def on_open(ws):
    """Send the synthesis request once the connection is open.

    The request is serialized from the module-level ``wsParam`` and sent
    from a newly started thread rather than from this callback itself.

    Args:
        ws: the WebSocket application object used to send the request.
    """
    def send_request(*_unused):
        # One JSON document carrying the three sections of the request.
        payload = json.dumps({
            "common": wsParam.CommonArgs,
            "business": wsParam.BusinessArgs,
            "data": wsParam.Data,
        })
        print("------>开始发送文本数据")
        ws.send(payload)

    thread.start_new_thread(send_request, ())

def read_text(path='content.txt'):
    """Return the full contents of a UTF-8 text file.

    Args:
        path: file to read. Defaults to ``content.txt`` in the current
            working directory, which is what existing callers rely on.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

def pcm_duration(filename):
    """Return the playing time, in seconds, of a raw PCM file.

    The duration is derived purely from the file size, assuming 16000
    samples per second and 2 bytes per sample. The value is also printed.

    Args:
        filename: path of the PCM file.

    Returns:
        The duration in seconds as a float.
    """
    byte_count = os.stat(filename).st_size
    # bytes -> samples-per-second units -> seconds (2 bytes per sample).
    seconds = byte_count / 16000 / 2
    print(seconds)
    return float(seconds)

def format_mtime(mtime):
    """Format a millisecond count as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        mtime: elapsed time in milliseconds.

    Returns:
        The timestamp string; hours are padded to at least two digits and
        are not wrapped at 24.
    """
    fields = (
        int(mtime / 3600000),            # whole hours
        int((mtime % 3600000) / 60000),  # minutes within the hour
        int((mtime % 60000) / 1000),     # seconds within the minute
        int(mtime % 1000),               # milliseconds within the second
    )
    return '%02d:%02d:%02d,%03d' % fields

def init():
    """Reset the PCM folder and build the module-level request parameters.

    Deletes Const.PCM_FOLDER if it exists, recreates it empty, reads the
    input text with read_text(), prints it, and stores a new Ws_Param in
    the global ``wsParam``. The credentials are placeholders that must be
    replaced with real values.
    """
    global wsParam

    pcm_dir = Const.PCM_FOLDER

    # Start from an empty folder so segments of an earlier run are gone.
    if os.path.exists(pcm_dir):
        shutil.rmtree(pcm_dir)
    # NOTE(review): the one-second pauses presumably give the filesystem
    # time to settle after the delete/create -- confirm they are needed.
    time.sleep(1)
    os.makedirs(pcm_dir)
    time.sleep(1)

    script_text = read_text()
    print(script_text)

    credentials = {
        'APPID': 'XXXXX',
        'APISecret': 'XXXXXXXXXXXXXXXXXXXXXXXXXX',
        'APIKey': 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX',
    }
    wsParam = Ws_Param(Text=script_text, **credentials)

def create_pcm():
    """Run the WebSocket session that streams the synthesized audio to disk.

    Builds the signed URL from the module-level ``wsParam``, wires up the
    on_message / on_error / on_close / on_open callbacks and blocks in
    ``run_forever`` until the session ends.
    """
    websocket.enableTrace(False)
    app = websocket.WebSocketApp(
        wsParam.create_url(),
        on_message=on_message,
        on_error=on_error,
        on_close=on_close,
    )
    app.on_open = on_open
    # NOTE(review): cert_reqs=ssl.CERT_NONE turns off TLS certificate
    # verification for this connection -- confirm that is intended.
    app.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

def create_srt():
    """Write content.srt from the per-segment PCM files in Const.PCM_FOLDER.

    Each ``v_<ced>.pcm`` file is one segment: its size gives the segment's
    duration (via pcm_duration) and ``ced`` is the byte offset, within the
    UTF-8 encoded input text, at which the segment's text ends. Segments
    are processed in ascending ``ced`` order so that both the running
    timeline and the text slices line up.

    A segment longer than 0.5 s produces one numbered cue whose text is
    the slice between the previous ``ced`` and its own. A shorter segment
    produces no cue, but it still advances the timeline and the text
    offset.

    content.srt is rewritten from scratch on every call, encoded as UTF-8.

    Raises:
        ValueError: if a ``v_*.pcm`` file name does not carry a numeric ced.
        UnicodeDecodeError: if a ced offset falls inside a multi-byte
            character of the text.
    """
    folder = Const.PCM_FOLDER

    text = read_text()
    print(text)
    # ced values are byte offsets, so slice the UTF-8 bytes; encode once.
    text_bytes = text.encode('utf-8')

    # os.listdir returns names in arbitrary order; the directory test must
    # use the path inside the PCM folder, not the bare name.
    segments = [
        name for name in os.listdir(folder)
        if name.startswith('v_') and name.endswith('.pcm')
        and not os.path.isdir(os.path.join(folder, name))
    ]
    segments.sort(key=lambda name: int(name[2:-4]))

    prevId = 1
    prevCed = 0
    prevDuration = 0

    # 'w' rather than 'a': a rerun must not append to the cues of an
    # earlier run. UTF-8 matches the encoding used to read the text.
    with open('content.srt', 'w', encoding='utf-8') as fb:
        for file in segments:
            print('process file ' + file)
            duration = pcm_duration(folder + '/' + file)
            ced = file[2:-4]
            print('ced=' + ced)
            if duration > 0.5:
                start_ms = int(round(prevDuration * 1000))
                end_ms = int(round((prevDuration + duration) * 1000))
                line1 = str(prevId)
                line2 = format_mtime(start_ms) + ' --> ' + format_mtime(end_ms)
                line3 = text_bytes[prevCed:int(ced)].decode('utf-8')
                print(line1)
                print(line2)
                print(line3)
                fb.write(line1 + '\n')
                fb.write(line2 + '\n')
                fb.write(line3 + '\n')
                fb.write('\n')
                prevId += 1
            prevDuration += duration
            prevCed = int(ced)
    print('prevDuration=' + str(prevDuration))

def convert_mp3():
    """Encode the merged PCM stream into all.mp3 with the ffmpeg CLI.

    The input ``<Const.PCM_FOLDER>/all.pcm`` is declared to ffmpeg as
    signed 16-bit little-endian, mono, 16000 Hz; an existing all.mp3 is
    overwritten (``-y``). A missing ffmpeg executable or a non-zero exit
    status is reported on stdout instead of being silently ignored; no
    exception is raised in either case.
    """
    # Local import keeps this block self-contained.
    import subprocess

    # Argument list instead of a shell string: no shell parsing or quoting
    # of the path is involved.
    cmd = [
        'ffmpeg', '-y',
        '-f', 's16le', '-ac', '1', '-ar', '16000', '-acodec', 'pcm_s16le',
        '-i', Const.PCM_FOLDER + '/all.pcm',
        'all.mp3',
    ]
    try:
        result = subprocess.run(cmd)
    except OSError as e:
        print('could not run ffmpeg: %s' % e)
        return
    if result.returncode != 0:
        print('ffmpeg exited with code %d; all.mp3 may be missing or incomplete'
              % result.returncode)

if __name__ == "__main__":
    # Pipeline: reset the PCM folder and build the request parameters,
    # stream the synthesized audio to disk, derive the subtitles from the
    # saved segments, then encode the merged PCM as MP3.
    init()
    create_pcm()
    create_srt()
    convert_mp3()