{- -----------------------------------------------------------------------------
Copyright 2020 Kevin P. Barry

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
----------------------------------------------------------------------------- -}

-- Author: Kevin P. Barry [ta0kira@gmail.com]

-- | Language-specific hyphenation rules.

{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE Safe #-}

module WEditorHyphen.LangHyphen (
  LangHyphen,
  langHyphen,
) where

import Data.Char
import Text.Hyphenation

import WEditor.LineWrap


-- | Word splitter that pairs a 'Language' with the 'Hyphenator' built for it.
--   The constructor is not exported; use 'langHyphen' to create a value.
data LangHyphen = LangHyphen Language Hyphenator

-- | Hyphenates words using 'Language'-specific rules.
--
--   Example usage:
--
--   @
--   import Text.Hyphenation
--   import WEditor.Document
--   import WEditor.LineWrap
--   import WEditorHyphen.LangHyphen
--
--   content = map UnparsedPara (lines "Your document content.")
--
--   doc = editDocument (breakWords (langHyphen English_US)) content
--   @
langHyphen :: Language -> LangHyphen
langHyphen l = LangHyphen l (languageHyphenator l)

-- Private below here.

-- Shows only the language; the hyphenator itself is not displayed.
instance Show LangHyphen where
  show (LangHyphen l _) = show l

instance WordSplitter LangHyphen Char where
  -- Computes the sizes of the leading segments that the word @cs@ should be
  -- broken into. The final segment is never included in the result.
  --
  -- @k@ is the limit applied to the first segment and @w@ the limit applied to
  -- every later one (presumably the space left on the current line and the
  -- full line width, respectively; see 'WordSplitter' in "WEditor.LineWrap").
  splitWord (LangHyphen l h) k w cs
    -- Decline to split at all if lines are narrower than the minimum width, or
    -- if the first-line limit is larger than the limit for new lines.
    | w < minWidth l || k > w = Nothing
    -- No break points if the word already fits within k, or if k is tiny.
    | k >= length cs || k < 3 = Just []
    | otherwise = Just breaks
    where
      -- Leading punctuation count, the word without surrounding punctuation,
      -- and trailing punctuation count.
      (nb, cs', ne) = trimPunct l cs
      hyphenLen = length (hyphenChar l)
      breaks
        -- Move the word to the next line if it has punctuation in the middle.
        | any (noSplitChars l) cs' = []
        | otherwise =
            case map length (hyphenate h cs') of
              -- At least two segments: punctuation stripped from the front is
              -- counted with the first one, and from the back with the last.
              n0 : ns@(_ : _) -> combine k (nb + n0) (addTrailing ns)
              -- The hyphenator found nowhere to split the word.
              _ -> []
      -- Adds the trailing punctuation count to the last segment size.
      addTrailing [] = []
      addTrailing [n] = [ne + n]
      addTrailing (n : ns) = n : addTrailing ns
      -- Greedily merges consecutive segment sizes. t is the limit for the
      -- segment being built, n is its size so far, and the list holds the
      -- sizes not yet consumed. After the first call the limit is always w.
      --
      -- NOTE(review): n is never checked against t on its own, so the first
      -- size emitted can exceed k when the first hyphenation segment is
      -- longer than the remaining space. Confirm the caller tolerates that.
      combine _ _ [] = []
      combine t n (s : ss)
        -- Add a break if adding a segment would exceed the remaining space.
        -- Room for the hyphen is only reserved when more segments follow.
        | needsBreak = n : combine w s ss
        -- Append the next segment to the current segment.
        | otherwise = combine w (n + s) ss
        where
          needsBreak = (n + s > t - hyphenLen && not (null ss)) || n + s > t
  isWordChar (LangHyphen l _) = wordChars l
  isWhitespace (LangHyphen l _) = whitespaceChars l
  appendHyphen (LangHyphen l _) = (++ hyphenChar l)

-- Narrowest line width for which 'splitWord' will attempt hyphenation.
minWidth :: Language -> Int
minWidth _ = 8

-- Characters that are treated as part of a word: letters, combining marks,
-- dashes, and everything matched by 'noSplitChars'.
wordChars :: Language -> Char -> Bool
wordChars l c = generalCategory c `elem` cats l || noSplitChars l c
  where
    -- Add language-specific tokenizing rules here.
    cats _ =
      [ UppercaseLetter
      , LowercaseLetter
      , TitlecaseLetter
      , ModifierLetter
      , OtherLetter
      , NonSpacingMark
      , SpacingCombiningMark
      , DashPunctuation
      ]

-- Characters that belong to a word but prevent it from being hyphenated when
-- they occur in its interior: digits, quotes, and assorted punctuation.
noSplitChars :: Language -> Char -> Bool
noSplitChars l c = generalCategory c `elem` cats l
  where
    -- Add language-specific punctuation rules here.
    cats _ =
      [ DecimalNumber
      , OtherNumber
      , ConnectorPunctuation
      , InitialQuote
      , FinalQuote
      , OtherPunctuation
      , CurrencySymbol
      ]

-- Characters that are treated as whitespace between words.
whitespaceChars :: Language -> Char -> Bool
whitespaceChars _ c = isSeparator c

-- The text appended to a line that ends with a hyphenated segment.
hyphenChar :: Language -> [Char]
hyphenChar _ = "-"

-- Strips 'noSplitChars' from both ends of a word. Returns the number of
-- characters removed from the front, the remaining core of the word, and the
-- number of characters removed from the back.
--
-- The trailing run is taken from what is left after the leading run has been
-- removed, so the three parts always add up to the length of the input, even
-- when every character is a no-split character.
trimPunct :: Language -> [Char] -> (Int,[Char],Int)
trimPunct l cs = (length leading, reverse coreRev, length trailingRev)
  where
    isPunct = noSplitChars l
    (leading, rest) = span isPunct cs
    (trailingRev, coreRev) = span isPunct (reverse rest)