{-# LANGUAGE ApplicativeDo #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RecordWildCards #-}
module Reanimate.Voice
( Transcript(..)
, TWord(..)
, Phone(..)
, findWord
, findWords
, loadTranscript
, fakeTranscript
, splitTranscript
, annotateWithTranscript
)
where
import Control.Monad (forM_)
import Data.Aeson
import Data.Char (isAlphaNum, isSpace)
import Data.List (sortOn)
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (listToMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.IO as T
import Reanimate.Animation (SVG, staticFrame)
import Reanimate.Constants (defaultStrokeWidth, screenHeight, screenWidth)
import Reanimate.LaTeX (latex, latexChunks)
import Reanimate.Misc (withTempFile)
import Reanimate.Scene (Scene, play, waitUntil)
import Reanimate.Svg (mkGroup, scale, translate, withStrokeColor, withStrokeWidth)
import System.Directory (doesFileExist)
import System.Exit (ExitCode (ExitFailure, ExitSuccess))
import System.FilePath (replaceExtension)
import System.IO.Unsafe (unsafePerformIO)
import System.Process (rawSystem, showCommandForUser)
data Transcript = Transcript
{ transcriptText :: Text
, transcriptKeys :: Map Text Int
, transcriptWords :: [TWord]
} deriving (Show)
instance FromJSON Transcript where
parseJSON = withObject "transcript" $ \o ->
Transcript <$> o .: "transcript" <*> pure Map.empty <*> o .: "words"
data TWord = TWord
{ wordAligned :: Text
, wordCase :: Text
, wordStart :: Double
, wordStartOffset :: Int
, wordEnd :: Double
, wordEndOffset :: Int
, wordPhones :: [Phone]
, wordReference :: Text
} deriving (Show)
instance FromJSON TWord where
parseJSON = withObject "word" $ \o ->
TWord
<$> o
.:? "alignedWord"
.!= T.empty
<*> o
.: "case"
<*> o
.:? "start"
.!= 0
<*> o
.: "startOffset"
<*> o
.:? "end"
.!= 0
<*> o
.: "endOffset"
<*> o
.:? "phones"
.!= []
<*> o
.: "word"
data Phone = Phone
{ phoneDuration :: Double
, phoneType :: Text
} deriving (Show)
instance FromJSON Phone where
parseJSON =
withObject "phone" $ \o -> Phone <$> o .: "duration" <*> o .: "phone"
findWord :: Transcript -> [Text] -> Text -> TWord
findWord t keys w = case listToMaybe (findWords t keys w) of
Nothing -> error $ "Word not in transcript: " ++ show (keys, w)
Just tword -> tword
findWords :: Transcript -> [Text] -> Text -> [TWord]
findWords t [] wd =
[ tword | tword <- transcriptWords t, wordReference tword == wd ]
findWords t (key : keys) wd =
[ tword
| tword <- findWords t keys wd
, wordStartOffset tword >= Map.findWithDefault badKey key (transcriptKeys t)
]
where badKey = error $ "Missing transcript key: " ++ show key
loadTranscript :: FilePath -> Transcript
loadTranscript path = unsafePerformIO $ do
rawTranscript <- T.readFile path
let keys = parseTranscriptKeys rawTranscript
trimTranscript = cutoutKeys keys rawTranscript
hasJSON <- doesFileExist jsonPath
transcript <- if hasJSON
then do
mbT <- decodeFileStrict jsonPath
case mbT of
Nothing -> error "bad json"
Just t -> pure t
else do
hasAudio <- findWithExtension path audioExtensions
case hasAudio of
Nothing -> return $ fakeTranscript' trimTranscript
Just audioPath -> withTempFile "txt" $ \txtPath -> do
T.writeFile txtPath trimTranscript
runGentleForcedAligner audioPath txtPath
mbT <- decodeFileStrict jsonPath
case mbT of
Nothing -> error "bad json"
Just t -> pure t
pure $ transcript { transcriptKeys = finalizeKeys keys }
where
jsonPath = replaceExtension path "json"
audioExtensions = ["mp3", "m4a", "flac"]
parseTranscriptKeys :: Text -> Map Text Int
parseTranscriptKeys = worker Map.empty 0
where
worker keys offset txt = case T.uncons txt of
Nothing -> keys
Just ('[', cs) ->
let key = T.takeWhile (/= ']') cs
newOffset = T.length key + 2
in worker (Map.insert key offset keys)
(offset + newOffset)
(T.drop newOffset txt)
Just (_, cs) -> worker keys (offset + 1) cs
finalizeKeys :: Map Text Int -> Map Text Int
finalizeKeys = Map.fromList . worker 0 . sortOn snd . Map.toList
where
worker _offset [] = []
worker offset ((key, at) : rest) =
(key, at - offset) : worker (offset + T.length key + 2) rest
cutoutKeys :: Map Text Int -> Text -> Text
cutoutKeys keys = T.concat . worker 0 (sortOn snd (Map.toList keys))
where
worker _offset [] txt = [txt]
worker offset ((key, at) : xs) txt =
let keyLen = T.length key + 2
(before, after) = T.splitAt (at - offset) txt
in before : worker (at + keyLen) xs (T.drop keyLen after)
findWithExtension :: FilePath -> [String] -> IO (Maybe FilePath)
findWithExtension _path [] = return Nothing
findWithExtension path (e : es) = do
let newPath = replaceExtension path e
hasFile <- doesFileExist newPath
if hasFile then return (Just newPath) else findWithExtension path es
runGentleForcedAligner :: FilePath -> FilePath -> IO ()
runGentleForcedAligner audioFile transcriptFile = do
ret <- rawSystem prog args
case ret of
ExitSuccess -> return ()
ExitFailure e ->
error
$ "Gentle forced aligner failed with: "
++ show e
++ "\nIs it running locally on port 8765?"
++ "\nCommand: "
++ showCommandForUser prog args
where
prog = "curl"
args =
[ "--silent"
, "--form"
, "audio=@" ++ audioFile
, "--form"
, "transcript=@" ++ transcriptFile
, "--output"
, replaceExtension audioFile "json"
, "http://localhost:8765/transcriptions?async=false"
]
data Token = TokenWord Int Int Text | TokenComma | TokenPeriod | TokenParagraph
deriving (Show)
lexText :: Text -> [Token]
lexText = worker 0
where
worker offset txt = case T.uncons txt of
Nothing -> []
Just (c, cs)
| isSpace c
-> let (w, rest) = T.span (== '\n') txt
in if T.length w >= 3
then TokenParagraph : worker (offset + T.length w) rest
else worker (offset + 1) cs
| c == '.'
-> TokenPeriod : worker (offset + 1) cs
| c == ','
-> TokenComma : worker (offset + 1) cs
| isWord c
-> let (w, rest) = T.span isWord txt
newOffset = offset + T.length w
in TokenWord offset newOffset w : worker newOffset rest
| otherwise
-> worker (offset + 1) cs
isWord c = isAlphaNum c || c `elem` ['\'', '-']
fakeTranscript :: Text -> Transcript
fakeTranscript rawTranscript =
let keys = parseTranscriptKeys rawTranscript
t = fakeTranscript' (cutoutKeys keys rawTranscript)
in t { transcriptKeys = finalizeKeys keys }
fakeTranscript' :: Text -> Transcript
fakeTranscript' input = Transcript { transcriptText = input
, transcriptKeys = Map.empty
, transcriptWords = worker 0 (lexText input)
}
where
worker _now [] = []
worker now (token : rest) = case token of
TokenWord start end w ->
let dur = realToFrac (end - start) * 0.1
in TWord { wordAligned = T.toLower w
, wordCase = "success"
, wordStart = now
, wordStartOffset = start
, wordEnd = now + dur
, wordEndOffset = end
, wordPhones = []
, wordReference = w
}
: worker (now + dur) rest
TokenComma -> worker (now + commaPause) rest
TokenPeriod -> worker (now + periodPause) rest
TokenParagraph -> worker (now + paragraphPause) rest
paragraphPause = 0.5
commaPause = 0.1
periodPause = 0.2
splitTranscript :: Transcript -> [(SVG, TWord)]
splitTranscript Transcript {..} =
[ (svg, tword)
| tword@TWord {..} <- transcriptWords
, let wordLength = wordEndOffset - wordStartOffset
[_, svg, _] = latexChunks
[ T.take wordStartOffset transcriptText
, T.take wordLength (T.drop wordStartOffset transcriptText)
, T.drop wordEndOffset transcriptText
]
]
annotateWithTranscript :: Transcript -> Scene s ()
annotateWithTranscript t = forM_ (transcriptWords t) $ \tword -> do
let svg = scale 1 $ latex (wordReference tword)
waitUntil (wordStart tword)
let dur = wordEnd tword - wordStart tword
play $ staticFrame dur $ position $ outline svg
where
position = translate (-screenWidth / 2) (-screenHeight / 2)
outline txt = mkGroup
[ withStrokeWidth (defaultStrokeWidth * 10) $ withStrokeColor "white" txt
, withStrokeWidth 0 txt
]