语音识别

各大厂商对比

https://blog.csdn.net/weixin_40875934/article/details/88536331

https://blog.csdn.net/thinktothings/article/details/97886877

1、腾讯

文档

腾讯云：https://console.cloud.tencent.com/asr

腾讯云maven仓库地址：https://mvnrepository.com/artifact/com.tencentcloudapi/tencentcloud-sdk-java

快速入门：https://cloud.tencent.com/document/product/1093/35691

javaSDK：https://cloud.tencent.com/document/product/1093/35735

代码

java

/**
 * Manual test for Tencent Cloud one-sentence speech recognition (ASR).
 *
 * <p>Reads a local MP3 file, Base64-encodes it and submits it through
 * {@code AsrClient.SentenceRecognition}, then prints the JSON response.
 */
public class TencentSpeechTest {

    /**
     * Recognizes speech by uploading a local audio file (as opposed to passing a URL).
     *
     * @throws IOException if the audio file cannot be opened or fully read
     */
    @Test
    public void wenjian() throws IOException {
        try {
            // IMPORTANT: replace the two empty strings with your own SecretId / SecretKey.
            // How to obtain them: https://cloud.tencent.com/document/product/441/6203
            // Console path: account menu (top right) -> Access Management -> Access Keys
            // -> API Key Management.
            Credential cred = new Credential("", "");

            HttpProfile httpProfile = new HttpProfile();
            httpProfile.setEndpoint("asr.tencentcloudapi.com");

            ClientProfile clientProfile = new ClientProfile();
            clientProfile.setHttpProfile(httpProfile);
            clientProfile.setSignMethod("TC3-HMAC-SHA256");
            AsrClient client = new AsrClient(cred, "ap-shanghai", clientProfile);

            // SourceType 1 with an empty Url: the audio is sent inline via Data/DataLen below.
            String params = "{\"ProjectId\":0,\"SubServiceType\":2,\"EngSerViceType\":\"16k_zh\",\"SourceType\":1,\"Url\":\"\",\"VoiceFormat\":\"mp3\",\"UsrAudioKey\":\"session-123\"}";
            SentenceRecognitionRequest req = SentenceRecognitionRequest.fromJsonString(params, SentenceRecognitionRequest.class);

            File file = new File("C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\out2.mp3");
            byte[] buffer = new byte[(int) file.length()];
            // try-with-resources closes the stream even when reading fails.
            try (FileInputStream inputFile = new FileInputStream(file)) {
                // A single read() may return fewer bytes than requested, so loop until
                // the buffer is full instead of uploading a partially filled array.
                int filled = 0;
                while (filled < buffer.length) {
                    int count = inputFile.read(buffer, filled, buffer.length - filled);
                    if (count < 0) {
                        throw new IOException("Unexpected end of file while reading " + file);
                    }
                    filled += count;
                }
            }
            // DataLen is the raw (pre-Base64) length of the audio.
            req.setDataLen(file.length());
            String encodeData = Base64.getEncoder().encodeToString(buffer);
            req.setData(encodeData);

            SentenceRecognitionResponse resp = client.SentenceRecognition(req);

            System.out.println(SentenceRecognitionRequest.toJsonString(resp));
        } catch (TencentCloudSDKException e) {
            System.out.println(e.toString());
        }
    }

}

2、百度

文档

快速入门：https://ai.baidu.com/ai-doc/SPEECH/pk4o0bkx8

教程：https://blog.csdn.net/u011560555/article/details/100037272

视频教程：https://www.bilibili.com/video/BV1Qi4y1G7jo

百度智能云：https://console.bce.baidu.com/ai/#/ai/speech/app/list

示例音频：https://ai.baidu.com/ai-doc/SPEECH/7k38lxpwf

格式要求：https://ai.baidu.com/ai-doc/SPEECH/Vk38lxily

代码

Java使用ffmpeg转码（mp3转码pcm与切割音频）：https://my.oschina.net/jiangqw/blog/3026135

Java实现剪切歌曲：https://blog.csdn.net/zuo19980407/article/details/106102158

Java使用IO流实现音频的剪切和拼接：https://blog.csdn.net/aofuqian6717/article/details/102091707

markdown

mp3转pcm使用范例：
ffmpeg -y -i out2.mp3 -f s16be -ar 16000 -ac 1 -acodec pcm_s16be pcm16k.pcm
ffmpeg -y -i out2.mp3 -acodec pcm_s16le -f s16le -ac 1 -ar 16000 pcm16k.pcm
注意：上面第一条命令（输出为大端 s16be）有问题，请使用第二条命令（小端 s16le）。
说明:
    1. -y：允许覆盖
    2. -i out2.mp3：源文件
    3. -acodec pcm_s16le：编码器，输出pcm格式，采用signed 16编码，字节序为小尾端（大尾端为be)；
    4. -f s16le：强制文件格式
    5. -ac 1: 声道数为1        -ac 2: 双声道
    6. -ar 16000: 采样率为16000
    
视频提取音频：
	ffmpeg -i temp.mp4 -f mp3 -vn temp.mp3

代码

java

// Baidu AI credentials: fill in APPID / AK (API key) / SK (secret key) before running.
public static final String APP_ID = "";
public static final String API_KEY = "";
public static final String SECRET_KEY = "";
// Alternative sample input (8 kHz WAV), currently disabled:
//private static final String testFileName = "C:\\Users\\xxl\\Desktop\\语音识别\\百度\\public\\8k.wav"; // technical support provided by Baidu Speech
// Audio file submitted for recognition (16 kHz PCM).
public static final String testFileName = "C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\16k.pcm"; // technical support provided by Baidu Speech

/**
 * Submits the PCM file at {@code testFileName} to Baidu speech recognition
 * (format "pcm", 16000 Hz) and prints the JSON response with 2-space indentation.
 */
@Test
public void yuyin() {
    // Create the AipSpeech client from the configured credentials.
    final AipSpeech speechClient = new AipSpeech(APP_ID, API_KEY, SECRET_KEY);

    // Optional network settings: connection and socket timeouts, in milliseconds.
    speechClient.setConnectionTimeoutInMillis(2000);
    speechClient.setSocketTimeoutInMillis(60000);

    // Optional, per the SDK sample: a proxy can be configured with either
    // setHttpProxy(host, port) or setSocketProxy(host, port) (choose one, or neither),
    // and the log4j configuration can be supplied through the "aip.log4j.conf"
    // system property (also settable as a JVM start-up argument).

    // Invoke the recognition API.
    final JSONObject recognition = speechClient.asr(testFileName, "pcm", 16000, null);

    final String prettyJson;
    try {
        prettyJson = recognition.toString(2);
    } catch (JSONException e) {
        e.printStackTrace();
        return;
    }
    System.out.println(prettyJson);
}

/**
 * Cuts a time slice out of an MP3 file by copying the byte range
 * {@code [start1, end1)} of the source file to the target file.
 *
 * <p>Byte offsets are estimated as {@code bitrate(kbps) * seconds * 1024 / 8}; the
 * bitrate can be looked up in the audio file's properties. With the values below
 * (124 kbps) the slice runs from roughly second 6 to second 10.
 *
 * <p>The target is opened in append mode, so running this repeatedly keeps adding
 * slices to the end of the existing output file instead of overwriting it.
 */
@Test
public void cut(){
    File source = new File("C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\苏州园林.mp3");
    File target = new File("C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\out2.mp3");

    // Example for a 128 kbps file: 20 s -> 128*20*1024/8 = 327680, 25 s -> 409600.
    int start1 = 95232;   // 124 * 6 * 1024 / 8
    int end1 = 158720;    // 124 * 10 * 1024 / 8

    // try-with-resources closes both streams even if reading or writing fails.
    try (BufferedInputStream in = new BufferedInputStream(new FileInputStream(source));
         BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(target, true))) {

        byte[] chunk = new byte[512];
        int position = 0; // offset in the source of the next byte to be read
        while (position < end1) {
            // Never read past end1 so the slice stops exactly at the requested offset.
            int wanted = Math.min(chunk.length, end1 - position);
            int len = in.read(chunk, 0, wanted);
            if (len == -1) {
                break; // source is shorter than end1
            }
            int chunkStart = position;
            position += len;
            if (position <= start1) {
                continue; // chunk lies entirely before the slice
            }
            // Write only the bytes actually read, and only the part at or after start1.
            int from = Math.max(0, start1 - chunkStart);
            out.write(chunk, from, len - from);
        }
        out.flush();
        System.out.println("剪切完成！");
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Converts an MP3 file to raw PCM (signed 16-bit little-endian, mono, 16000 Hz)
 * by running the {@code ffmpeg} executable found on the PATH.
 *
 * <p>Equivalent command line:
 * {@code ffmpeg -y -i <mp3> -acodec pcm_s16le -f s16le -ac 1 -ar 16000 <pcm>}.
 * Success is reported only when ffmpeg exits with status 0.
 */
@Test
public void mp32pcm() {
    String mp3filePath = "C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\out2.mp3";
    String pcmFilePath = "C:\\Users\\xxl\\Desktop\\语音识别\\百度\\test\\16k.pcm";
    try {
        // Pass arguments as a list: a single command string is split on whitespace,
        // which breaks as soon as a path contains a space.
        ProcessBuilder builder = new ProcessBuilder(
                "ffmpeg", "-y", "-i", mp3filePath,
                "-acodec", "pcm_s16le", "-f", "s16le", "-ac", "1", "-ar", "16000",
                pcmFilePath);
        // ffmpeg logs heavily to stderr; inheriting the streams keeps the pipe from
        // filling up and blocking waitFor() forever.
        builder.inheritIO();

        Process exec = builder.start();
        int exitCode = exec.waitFor();
        if (exitCode == 0) {
            System.out.println("MP3转换PCM文件 成功");
        } else {
            System.out.println("MP3转换PCM文件 失败");
            System.err.println("ffmpeg exited with code " + exitCode);
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can still observe the interruption.
        Thread.currentThread().interrupt();
        System.out.println("MP3转换PCM文件 失败");
    } catch (Exception e) {
        System.out.println("MP3转换PCM文件 失败");
        e.printStackTrace();
    }
}

报错

官方错误信息：https://ai.baidu.com/ai-doc/SPEECH/Yk4o0bkop

1、3305：baidu-aip语音识别错误request pv too much

{
'err_msg': 'request pv too much', 
'err_no': 3305, 
'sn': '876137091191590632079'
}

出现这种错误 {'err_msg': 'request pv too much', 'err_no': 3305, 'sn': '386874002531595317040'} 的原因是：

如果是第一次使用，是因为还未领取接口的免费次数，在控制台--语音技术--概览处领取接口的免费次数。

如果不是第一次使用，那么代表免费次数已经耗尽，在相同位置开通接口的付费功能即可。

2、3307：语音识别接口总是返回 3307错误

3307 一般是百度服务器繁忙，也有可能是音频质量过差。建议核对好音频文件的各项参数。

3307应是音频质量问题，可以先测试官方示例音频，如果没有问题再对比官方文档对音频格式的要求并检查是否有真人声音。

3、3312：音频格式参数 format 不正确，请确认上传的音频不是 mp3 文件。

4、3301：音频质量问题，导致识别内容为空。

3、科大讯飞

语音听写 Java SDK 文档 | 讯飞开放平台文档中心

4、阿里云

阿里云实时语音识别：https://help.aliyun.com/document_detail/84430.html

语音识别 ​

1、腾讯 ​

文档 ​

代码 ​

2、百度 ​