I get an error that I’m not in a position to see any useful resource across the web for. I attempted a number of various things however none are in a position to resolve this subject: Audio enter timestamp overlaps or precedes prior audio enter
Context: I used to be taking part in round with some code on my Mac, and I maintain getting the error after half a minute or so if operating this code which transcribes speech to textual content utilizing the Apple Speech framework’s SpeechTranscriber. I’ve added a number of feedback to assist clarify the code too. Any assist could be significantly appreciated! I get the identical subject with DictationTranscriber as effectively, which made me really feel like the problem is someplace in my inputNode.installTap() code block, however I simply can’t determine it out
non-public func startRecording() async
{
do {
// 1. Request microphone entry
let permissionGranted = await AVAudioApplication.requestRecordPermission()
guard permissionGranted else {
transcribedText = "Microphone entry denied."
return
}
let targetLocale = Locale(identifier: selectedLanguage)
// 2 & 3. Create the suitable transcriber kind safely
var resolvedTranscriber: SpeechModule? = nil
var isUsingSpeechTranscriber = false
// Strive SpeechTranscriber first if supported equal asset is confirmed
if let supportedLocale = await SpeechTranscriber.supportedLocale(equivalentTo: targetLocale) {
do {
// Reserve the asset to register the bundle subscription
attempt await AssetInventory.reserve(locale: supportedLocale)
let transcriberInstance = SpeechTranscriber(locale: supportedLocale, preset: .progressiveTranscription)
// 4. Obtain and Set up Property (Essential test inside a sandbox do-catch)
if let request = attempt await AssetInventory.assetInstallationRequest(supporting: [transcriberInstance]) {
isDownloadingAssets = true
transcribedText = "Downloading language belongings..."
attempt await request.downloadAndInstall()
isDownloadingAssets = false
}
resolvedTranscriber = transcriberInstance
isUsingSpeechTranscriber = true
print("Efficiently initialized high-end SpeechTranscriber for (supportedLocale.identifier)")
} catch {
isDownloadingAssets = false
print("SpeechTranscriber subscription or obtain failed: (error.localizedDescription)")
print("Falling again to DictationTranscriber for system-supported language workflow.")
}
}
guard let transcriberModule = resolvedTranscriber else { return }
transcribedText = "Listening..."
var inputNode = engine.inputNode
var recordingFormat = inputNode.inputFormat(forBus: 0)
let (inputSequence, builder) = AsyncStream.makeStream(of: AnalyzerInput.self)
self.inputBuilder = builder
if let selectedInputID = selectedInputID, let audioUnit = inputNode.audioUnit {
print("-A1")
var deviceID = selectedInputID
let error = AudioUnitSetProperty(
audioUnit,
kAudioOutputUnitProperty_CurrentDevice,
kAudioUnitScope_Global,
0,
&deviceID,
UInt32(MemoryLayout.measurement)
)
if error != noErr {
print("Warning: Couldn't set enter system (Error (error))")
}
// Inform the engine to flush modifications and adapt to the brand new {hardware} system
engine.reset()
engine.put together()
inputNode.removeTap(onBus: 0)
inputNode = engine.inputNode //
recordingFormat = inputNode.inputFormat(forBus: 0)
print("Recording format pattern fee: (recordingFormat.sampleRate)Hz") // It will now accurately output 96000Hz!
}
// 5. Setup AsyncStream for Audio Enter
// Faucet the microphone to get audio buffers
let hardwareFormat = inputNode.inputFormat(forBus: 0)
guard let targetSpeechFormat = AVAudioFormat(
commonFormat: .pcmFormatInt16,
sampleRate: 16000.0, // CRITICAL: Downsample to 16kHz
channels: 1, // Dictation works greatest with mono (1 channel)
interleaved: false
) else {
print("Error creating goal speech format.")
return
}
// 3. Initialize the audio converter that handles each format conversion AND downsampling
guard let audioConverter = AVAudioConverter(from: hardwareFormat, to: targetSpeechFormat) else {
print("Couldn't create AVAudioConverter for 16kHz conversion.")
return
}
print("A0")
self.processedConvertedFrames = 0
// 2. Replace your faucet block
inputNode.installTap(onBus: 0, bufferSize: 4096, format: hardwareFormat) { [weak self] (incomingBuffer, when) in
guard let self = self else { return }
let sampleRateRatio = targetSpeechFormat.sampleRate / hardwareFormat.sampleRate
let targetFrameCapacity = AVAudioFrameCount(Double(incomingBuffer.frameLength) * sampleRateRatio)
guard let convertedBuffer = AVAudioPCMBuffer(pcmFormat: targetSpeechFormat, frameCapacity: targetFrameCapacity) else {
return
}
var error: NSError? = nil
var hasProvidedData = false // Monitor if we have already handed the converter our knowledge
let standing = audioConverter.convert(to: convertedBuffer, error: &error) { inNumPackets, outStatus in
// If the converter asks for MORE knowledge in the identical faucet cycle, we should inform it to attend.
if hasProvidedData {
outStatus.pointee = .noDataNow
return nil
}
outStatus.pointee = .haveData
hasProvidedData = true
return incomingBuffer
}
if standing == .error || error != nil {
print("Conversion failed: (error?.localizedDescription ?? "Unknown error")")
return
}
// Reject empty buffers!
// A 0-length buffer generates an similar timestamp to the earlier body, inflicting the Code 2 Crash.
guard convertedBuffer.frameLength > 0 else {
return
}
// Calculate exact timeline based mostly on whole processed frames
let currentSeconds = Double(self.processedConvertedFrames) / targetSpeechFormat.sampleRate
let cmTime = CMTime(seconds: currentSeconds, preferredTimescale: Int32(targetSpeechFormat.sampleRate))
let inputSample = AnalyzerInput(buffer: convertedBuffer, bufferStartTime: cmTime)
self.inputBuilder?.yield(inputSample)
// Increment ONLY by the frames that have been efficiently created
self.processedConvertedFrames += Int64(convertedBuffer.frameLength)
}
self.transcriptionTask = Process { @MainActor in
print("A2")
// Reset our textual content buffer when a brand new session begins
self.finalizedTextHistory = ""
var lastSegmentEndTime: TimeInterval = Date().timeIntervalSince1970
do {
if isUsingSpeechTranscriber, let speechTranscriber = transcriberModule as? SpeechTranscriber {
for attempt await end in speechTranscriber.outcomes {
let activeText = String(outcome.textual content.characters)
if outcome.isFinal {
self.finalizedTextHistory += activeText
self.transcribedText = self.finalizedTextHistory
} else {
self.transcribedText = self.finalizedTextHistory + activeText
}
}
}
} catch {
print("Transcription stream threw an error: (error)")
}
}
print("A4")
engine.put together()
do {
attempt engine.begin()
print("Audio Engine began efficiently!")
} catch {
print("Error beginning audio engine: (error.localizedDescription)")
}
isRecording = true
transcribedText = "" // Clear outdated textual content
//
// // 6. Setup Analyzer with the polymorphically typed transcriberModule
let analyzer = SpeechAnalyzer(modules: [transcriberModule])
self.analyzer = analyzer
analysisTask = Process {
do {
let lastSampleTime = attempt await analyzer.analyzeSequence(inputSequence)
if let time = lastSampleTime {
attempt await analyzer.finalizeAndFinish(by: time)
} else {
await analyzer.cancelAndFinishNow()
}
} catch {
print("Evaluation failed: (error)")
}
}
} catch {
print("Failed to start out recording setup: (error)")
stopRecording()
}
}
That is the logs in my console after I run this, the error reveals up 30-60 seconds after it has began and works effectively for a bit:
Recording format pattern fee: 96000.0Hz
A0
A4
Audio Engine began efficiently!
A2
A3
SpeechAnalyzer: Enter loop ending with error: Error Area=SFSpeechErrorDomain Code=2 "Audio enter timestamp overlaps or precedes prior audio enter" UserInfo={NSLocalizedDescription=Audio enter timestamp overlaps or precedes prior audio enter}
Evaluation failed: Error Area=SFSpeechErrorDomain Code=2 "Audio enter timestamp overlaps or precedes prior audio enter" UserInfo={NSLocalizedDescription=Audio enter timestamp overlaps or precedes prior audio enter}
Transcription stream threw an error: Error Area=SFSpeechErrorDomain Code=2 "Audio enter timestamp overlaps or precedes prior audio enter" UserInfo={NSLocalizedDescription=Audio enter timestamp overlaps or precedes prior audio enter}
