{-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ApplicativeDo #-} {-# LANGUAGE RecordWildCards #-} {-| Reanimate can automatically synchronize animations to your voice if you have a transcript and an audio recording. This works with the help of Gentle (). Accuracy is not perfect but it is pretty close, and it is by far the easiest way of adding narration to an animation. -} module Reanimate.Voice ( Transcript(..) , TWord(..) , Phone(..) , findWord -- :: Transcript -> [Text] -> Text -> TWord , findWords -- :: Transcript -> [Text] -> Text -> [TWord] , loadTranscript -- :: FilePath -> Transcript , fakeTranscript -- :: Text -> Transcript , splitTranscript -- :: Transcript -> SVG -> [(SVG, TWord)] , annotateWithTranscript -- :: Transcript -> Scene s () ) where import Data.Aeson import Data.Char import System.IO.Unsafe ( unsafePerformIO ) import System.Directory import System.FilePath import System.Process import System.Exit import Control.Monad import Data.List import Data.Maybe import qualified Data.Map as Map import Data.Map ( Map ) import Data.Text ( Text ) import qualified Data.Text as T import qualified Data.Text.IO as T import Reanimate.Misc import Reanimate.LaTeX import Reanimate.Scene import Reanimate.Animation import Reanimate.Svg import Reanimate.Constants -- | Aligned transcript. Contains the transcript text as well as -- timing data for each word. data Transcript = Transcript { transcriptText :: Text , transcriptKeys :: Map Text Int , transcriptWords :: [TWord] } deriving (Show) instance FromJSON Transcript where parseJSON = withObject "transcript" $ \o -> Transcript <$> o .: "transcript" <*> pure Map.empty <*> o .: "words" -- | Spoken word. Includes information about when it was spoken, -- its duration, and its phonemes. data TWord = TWord { wordAligned :: Text , wordCase :: Text , wordStart :: Double -- ^ Start of pronunciation in seconds , wordStartOffset :: Int -- ^ Character index of word in transcript , wordEnd :: Double -- ^ End of pronunciation in seconds , wordEndOffset :: Int -- ^ Last character index of word in transcript , wordPhones :: [Phone] , wordReference :: Text -- ^ The word being pronounced. } deriving (Show) instance FromJSON TWord where parseJSON = withObject "word" $ \o -> TWord <$> o .:? "alignedWord" .!= T.empty <*> o .: "case" <*> o .:? "start" .!= 0 <*> o .: "startOffset" <*> o .:? "end" .!= 0 <*> o .: "endOffset" <*> o .:? "phones" .!= [] <*> o .: "word" -- | Phoneme type data Phone = Phone { phoneDuration :: Double , phoneType :: Text } deriving (Show) instance FromJSON Phone where parseJSON = withObject "phone" $ \o -> Phone <$> o .: "duration" <*> o .: "phone" -- | Locate the first word that occurs after all the given keys. -- An error is thrown if no such word exists. An error is thrown -- if the keys do not exist in the transcript. findWord :: Transcript -> [Text] -> Text -> TWord findWord t keys w = case listToMaybe (findWords t keys w) of Nothing -> error $ "Word not in transcript: " ++ show (keys, w) Just tword -> tword -- | Locate all words that occur after all the given keys. -- May return an empty list. An error is thrown -- if the keys do not exist in the transcript. findWords :: Transcript -> [Text] -> Text -> [TWord] findWords t [] wd = [ tword | tword <- transcriptWords t, wordReference tword == wd ] findWords t (key : keys) wd = [ tword | tword <- findWords t keys wd , wordStartOffset tword >= Map.findWithDefault badKey key (transcriptKeys t) ] where badKey = error $ "Missing transcript key: " ++ show key -- | Loading a transcript does three things depending on which files are available -- with the same basename as the input argument: -- 1. If a JSON file is available, it is parsed and returned. -- 2. If an audio file is available, reanimate tries to align it by calling out to -- Gentle on localhost:8765/. If Gentle is not running, an error will be thrown. -- 3. If only the text transcript is available, a fake transcript is returned, -- with timings roughly at 120 words per minute. loadTranscript :: FilePath -> Transcript loadTranscript path = unsafePerformIO $ do rawTranscript <- T.readFile path let keys = parseTranscriptKeys rawTranscript trimTranscript = cutoutKeys keys rawTranscript hasJSON <- doesFileExist jsonPath transcript <- if hasJSON then do mbT <- decodeFileStrict jsonPath case mbT of Nothing -> error "bad json" Just t -> pure t else do hasAudio <- findWithExtension path audioExtensions case hasAudio of Nothing -> return $ fakeTranscript' trimTranscript Just audioPath -> withTempFile "txt" $ \txtPath -> do T.writeFile txtPath trimTranscript runGentleForcedAligner audioPath txtPath mbT <- decodeFileStrict jsonPath case mbT of Nothing -> error "bad json" Just t -> pure t pure $ transcript { transcriptKeys = finalizeKeys keys } where jsonPath = replaceExtension path "json" audioExtensions = ["mp3", "m4a", "flac"] parseTranscriptKeys :: Text -> Map Text Int parseTranscriptKeys = worker Map.empty 0 where worker keys offset txt = case T.uncons txt of Nothing -> keys Just ('[', cs) -> let key = T.takeWhile (/= ']') cs newOffset = T.length key + 2 in worker (Map.insert key offset keys) (offset + newOffset) (T.drop newOffset txt) Just (_, cs) -> worker keys (offset + 1) cs finalizeKeys :: Map Text Int -> Map Text Int finalizeKeys = Map.fromList . worker 0 . sortOn snd . Map.toList where worker _offset [] = [] worker offset ((key, at) : rest) = (key, at - offset) : worker (offset + T.length key + 2) rest cutoutKeys :: Map Text Int -> Text -> Text cutoutKeys keys = T.concat . worker 0 (sortOn snd (Map.toList keys)) where worker _offset [] txt = [txt] worker offset ((key, at) : xs) txt = let keyLen = T.length key + 2 (before, after) = T.splitAt (at - offset) txt in before : worker (at + keyLen) xs (T.drop keyLen after) findWithExtension :: FilePath -> [String] -> IO (Maybe FilePath) findWithExtension _path [] = return Nothing findWithExtension path (e : es) = do let newPath = replaceExtension path e hasFile <- doesFileExist newPath if hasFile then return (Just newPath) else findWithExtension path es runGentleForcedAligner :: FilePath -> FilePath -> IO () runGentleForcedAligner audioFile transcriptFile = do ret <- rawSystem prog args case ret of ExitSuccess -> return () ExitFailure e -> error $ "Gentle forced aligner failed with: " ++ show e ++ "\nIs it running locally on port 8765?" ++ "\nCommand: " ++ showCommandForUser prog args where prog = "curl" args = [ "--silent" , "--form" , "audio=@" ++ audioFile , "--form" , "transcript=@" ++ transcriptFile , "--output" , replaceExtension audioFile "json" , "http://localhost:8765/transcriptions?async=false" ] data Token = TokenWord Int Int Text | TokenComma | TokenPeriod | TokenParagraph deriving (Show) lexText :: Text -> [Token] lexText = worker 0 where worker offset txt = case T.uncons txt of Nothing -> [] Just (c, cs) | isSpace c -> let (w, rest) = T.span (== '\n') txt in if T.length w >= 3 then TokenParagraph : worker (offset + T.length w) rest else worker (offset + 1) cs | c == '.' -> TokenPeriod : worker (offset + 1) cs | c == ',' -> TokenComma : worker (offset + 1) cs | isWord c -> let (w, rest) = T.span isWord txt newOffset = offset + T.length w in TokenWord offset newOffset w : worker newOffset rest | otherwise -> worker (offset + 1) cs isWord c = isAlphaNum c || c `elem` ['\'', '-'] -- | Fake transcript timings at roughly 120 words per minute. fakeTranscript :: Text -> Transcript fakeTranscript rawTranscript = let keys = parseTranscriptKeys rawTranscript t = fakeTranscript' (cutoutKeys keys rawTranscript) in t { transcriptKeys = finalizeKeys keys } fakeTranscript' :: Text -> Transcript fakeTranscript' input = Transcript { transcriptText = input , transcriptKeys = Map.empty , transcriptWords = worker 0 (lexText input) } where worker _now [] = [] worker now (token : rest) = case token of TokenWord start end w -> let dur = realToFrac (end - start) * 0.1 in TWord { wordAligned = T.toLower w , wordCase = "success" , wordStart = now , wordStartOffset = start , wordEnd = now + dur , wordEndOffset = end , wordPhones = [] , wordReference = w } : worker (now + dur) rest TokenComma -> worker (now + commaPause) rest TokenPeriod -> worker (now + periodPause) rest TokenParagraph -> worker (now + paragraphPause) rest paragraphPause = 0.5 commaPause = 0.1 periodPause = 0.2 -- | Convert the transcript text to an SVG image using LaTeX and associate -- each word image with its timing information. splitTranscript :: Transcript -> [(SVG, TWord)] splitTranscript Transcript {..} = [ (svg, tword) | tword@TWord {..} <- transcriptWords , let wordLength = wordEndOffset - wordStartOffset [_, svg, _] = latexChunks [ T.take wordStartOffset transcriptText , T.take wordLength (T.drop wordStartOffset transcriptText) , T.drop wordEndOffset transcriptText ] ] -- | Helper function for rendering a transcript. annotateWithTranscript :: Transcript -> Scene s () annotateWithTranscript t = forM_ (transcriptWords t) $ \tword -> do let svg = scale 1 $ latex (wordReference tword) waitUntil (wordStart tword) let dur = wordEnd tword - wordStart tword play $ staticFrame dur $ position $ outline svg where position = translate (-screenWidth / 2) (-screenHeight / 2) outline txt = mkGroup [ withStrokeWidth (defaultStrokeWidth * 10) $ withStrokeColor "white" txt , withStrokeWidth 0 txt ]