From 55c6fca04ebfe68837ead9826cb584f8c8982781 Mon Sep 17 00:00:00 2001
From: ssbothwell
Date: Thu, 4 Nov 2021 20:45:08 -0700
Subject: [PATCH 1/2] Changes encodeChar to return a NonEmpty list

---
Review notes (this section is not part of the commit message):
* Line structure of this series was restored from a whitespace-collapsed
  copy. Hunk line counts match the @@ headers, but indentation and column
  alignment inside the hunks are reconstructed; if a context line fails to
  match, apply with whitespace-tolerant matching (git apply --ignore-whitespace).
* NOTE(review): the Haddock line above encodeChar is unchanged context and
  still reads "to a list of 'Word8' values" although the result type becomes
  NE.NonEmpty Word8; consider rewording it in a follow-up.

 Codec/Binary/UTF8/String.hs | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/Codec/Binary/UTF8/String.hs b/Codec/Binary/UTF8/String.hs
index a576355..05bb13f 100644
--- a/Codec/Binary/UTF8/String.hs
+++ b/Codec/Binary/UTF8/String.hs
@@ -26,6 +26,7 @@ module Codec.Binary.UTF8.String (
   , utf8Encode
   ) where
 
+import qualified Data.List.NonEmpty as NE
 import Data.Word        (Word8,Word32)
 import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
 import Data.Char        (chr,ord)
@@ -46,30 +47,30 @@ replacement_character :: Char
 replacement_character = '\xfffd'
 
 -- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format.
-encodeChar :: Char -> [Word8]
-encodeChar = map fromIntegral . go . ord
+encodeChar :: Char -> NE.NonEmpty Word8
+encodeChar = fmap fromIntegral . go . ord
  where
   go oc
-   | oc <= 0x7f       = [oc]
+   | oc <= 0x7f       = oc NE.:|
+                        []
 
-   | oc <= 0x7ff      = [ 0xc0 + (oc `shiftR` 6)
-                        , 0x80 + oc .&. 0x3f
-                        ]
+   | oc <= 0x7ff      = 0xc0 + (oc `shiftR` 6) NE.:|
+                        [ 0x80 + oc .&. 0x3f ]
 
-   | oc <= 0xffff     = [ 0xe0 + (oc `shiftR` 12)
-                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
-                        , 0x80 + oc .&. 0x3f
-                        ]
-   | otherwise        = [ 0xf0 + (oc `shiftR` 18)
-                        , 0x80 + ((oc `shiftR` 12) .&. 0x3f)
-                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
-                        , 0x80 + oc .&. 0x3f
-                        ]
+   | oc <= 0xffff     = 0xe0 + (oc `shiftR` 12) NE.:|
+                        [ 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
+   | otherwise        = 0xf0 + (oc `shiftR` 18) NE.:|
+                        [ 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                        , 0x80 + oc .&. 0x3f
+                        ]
 
 
 -- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format.
 encode :: String -> [Word8]
-encode = concatMap encodeChar
+encode = concatMap (NE.toList . encodeChar)
 
 --
 -- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String'

From 5f02adfeaf2f632b2bc99a04584d208d094e856b Mon Sep 17 00:00:00 2001
From: ssbothwell
Date: Fri, 5 Nov 2021 10:42:27 -0700
Subject: [PATCH 2/2] Adds CPP to modify encodeChar for ghc7

---
Review notes (this section is not part of the commit message):
* The guard around the Data.List.NonEmpty import was "> 710" while the
  NonEmpty implementation is selected by the complement of "< 802". On
  compilers in between, the import was present but the tuple branch was
  compiled, leaving the import unused. The import guard now reads ">= 802"
  so NE is imported exactly when the NE-based branches are compiled.
* Because of that one-line change the post-image blob id on the index line
  below (94ba5f3) was not recomputed and no longer describes the result.
* NOTE(review): no CPP language pragma or cpp-options setting is visible in
  these hunks; confirm CPP is enabled for this module before merging.
* NOTE(review): with "< 802" the public type of encodeChar differs by
  compiler version ((Word8, [Word8]) vs NE.NonEmpty Word8); confirm callers
  are prepared for both.

 Codec/Binary/UTF8/String.hs | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/Codec/Binary/UTF8/String.hs b/Codec/Binary/UTF8/String.hs
index 05bb13f..94ba5f3 100644
--- a/Codec/Binary/UTF8/String.hs
+++ b/Codec/Binary/UTF8/String.hs
@@ -26,7 +26,9 @@ module Codec.Binary.UTF8.String (
   , utf8Encode
   ) where
 
+#if __GLASGOW_HASKELL__ >= 802
 import qualified Data.List.NonEmpty as NE
+#endif
 import Data.Word        (Word8,Word32)
 import Data.Bits        ((.|.),(.&.),shiftL,shiftR)
 import Data.Char        (chr,ord)
@@ -47,6 +49,28 @@ replacement_character :: Char
 replacement_character = '\xfffd'
 
 -- | Encode a single Haskell 'Char' to a list of 'Word8' values, in UTF8 format.
+#if __GLASGOW_HASKELL__ < 802
+encodeChar :: Char -> (Word8, [Word8])
+encodeChar = (\(x, xs) -> (fromIntegral x, fmap fromIntegral xs)) . go . ord
+ where
+  go oc
+   | oc <= 0x7f       = ( oc
+                        , [])
+
+   | oc <= 0x7ff      = ( 0xc0 + (oc `shiftR` 6)
+                        , [ 0x80 + oc .&. 0x3f ])
+
+   | oc <= 0xffff     = ( 0xe0 + (oc `shiftR` 12)
+                        , [ 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                          , 0x80 + oc .&. 0x3f
+                          ])
+
+   | otherwise        = ( 0xf0 + (oc `shiftR` 18)
+                        , [ 0x80 + ((oc `shiftR` 12) .&. 0x3f)
+                          , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
+                          , 0x80 + oc .&. 0x3f
+                          ])
+#else
 encodeChar :: Char -> NE.NonEmpty Word8
 encodeChar = fmap fromIntegral . go . ord
  where
@@ -61,16 +85,21 @@ encodeChar = fmap fromIntegral . go . ord
                        [ 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]
+
   | otherwise        = 0xf0 + (oc `shiftR` 18) NE.:|
                        [ 0x80 + ((oc `shiftR` 12) .&. 0x3f)
                        , 0x80 + ((oc `shiftR` 6) .&. 0x3f)
                        , 0x80 + oc .&. 0x3f
                        ]
-
+#endif
 
 -- | Encode a Haskell 'String' to a list of 'Word8' values, in UTF8 format.
 encode :: String -> [Word8]
+#if __GLASGOW_HASKELL__ < 802
+encode = concatMap ((\(x, xs) -> x:xs) . encodeChar)
+#else
 encode = concatMap (NE.toList . encodeChar)
+#endif
 
 --
 -- | Decode a UTF8 string packed into a list of 'Word8' values, directly to 'String'