{-# LANGUAGE UnboxedTuples, BangPatterns #-} -- | JSON Canonicalization Scheme https://datatracker.ietf.org/doc/html/rfc8785 module Data.Aeson.RFC8785 ( encodeCanonical, ) where import Data.List (sortBy) import Data.Ord (comparing) import GHC.Integer (quotRemInteger) import Math.NumberTheory.Logarithms (integerLog10) import Data.Aeson import Data.Aeson.Encoding import Data.Aeson.Encoding.Internal import Data.Aeson.Internal.Prelude import qualified Data.Aeson.Key as Key import qualified Data.Aeson.KeyMap as KM import qualified Data.ByteString as BS import qualified Data.ByteString.Builder as B import qualified Data.ByteString.Builder.Prim as BP import qualified Data.ByteString.Lazy as LBS import qualified Data.Scientific as Sci import qualified Data.Text.Encoding as TE import qualified Data.Vector as V import qualified Data.Word8.Patterns as W8 -- $setup -- >>> import Data.Aeson -- | Encode to JSON according to RFC 8785 canonicalization scheme. -- https://datatracker.ietf.org/doc/html/rfc8785 -- -- 'encodeCanonical' uses 'toJSON' to produce intermediate 'Value', -- as 'toEncoding' may (and most likely) produces non-canonical JSON. -- -- Note: @decode (encodeCanonical v) === Just v@ for all @v :: Value@, -- i.e. 'encodeCanonical' doesn't lose any information. -- -- However, the example in RFC8785 /loses/ information as the intermediate -- number representation is 'Double', also current @toJSON :: Double -> Value@ -- sometimes produces too precise values. For example -- -- >>> toJSON (1e23 :: Double) -- Number 9.999999999999999e22 -- -- 'show' also behaves the same: -- -- >>> 1e23 :: Double -- 9.999999999999999e22 -- -- Note: RFC8785 is __not the same scheme__ as used in -- [canonical-json](https://hackage.haskell.org/package/canonical-json) package -- (https://wiki.laptop.org/go/Canonical_JSON). -- That scheme produces /invalid/ JSON (e.g. control characters encoded as is, not escaped) -- and cannot encode non-integral numbers. -- -- @since 2.2.1.0 -- encodeCanonical :: ToJSON a => a -> LBS.ByteString encodeCanonical = encodingToLazyByteString . toCanonical . toJSON toCanonical :: Value -> Encoding toCanonical Null = null_ toCanonical (Bool b) = bool b toCanonical (Number n) = canonicalNumber n toCanonical (String s) = canonicalString s toCanonical (Array v) = list toCanonical (V.toList v) toCanonical (Object m) = dict (canonicalString . Key.toText) toCanonical ifr $ sortBy (\(k1, _) (k2, _) -> propertyCmp k1 k2) (KM.toList m) ifr :: (k -> v -> a -> a) -> a -> [(k, v)] -> a ifr f z = foldr (\(k, v) -> f k v) z {-# INLINE ifr #-} -- Property name strings to be sorted are formatted as arrays of UTF-16 code units. propertyCmp :: Key -> Key -> Ordering propertyCmp = comparing f where -- this is slow implementation, but it's obviously not wrong. f :: Key -> BS.ByteString f = TE.encodeUtf16BE . Key.toText -- strings are already serialized canonically. canonicalString :: Text -> Encoding' a canonicalString = text -- RFC 8785 is outsourcing number format to ECMA-262. -- 10th edition, 7.1.12.1 NumberToString -- https://262.ecma-international.org/10.0/#sec-tostring-applied-to-the-number-type -- -- Note: this specification is not lossy -- Given 'Scientific' we can choose n,k,s uniquely: 'nks'. -- -- RFC8785 Appendix D says "don't use bignums". canonicalNumber :: Scientific -> Encoding canonicalNumber m = case compare m 0 of EQ -> Encoding (B.word8 W8.DIGIT_0) LT -> Encoding (B.word8 W8.HYPHEN <> fromEncoding (canonicalNumber' (negate m))) GT -> canonicalNumber' m -- input: Positive number canonicalNumber' :: Scientific -> Encoding canonicalNumber' m | k <= n, n <= 21 = Encoding $ BP.primMapListFixed BP.word8 ds <> BP.primMapListFixed BP.word8 (replicate (n - k) W8.DIGIT_0) | 0 < n, n <= 21 , let (pfx, sfx) = splitAt n ds = Encoding $ BP.primMapListFixed BP.word8 pfx <> B.word8 W8.PERIOD <> BP.primMapListFixed BP.word8 sfx | -6 < n, n <= 0 = Encoding $ B.word8 W8.DIGIT_0 <> B.word8 W8.PERIOD <> BP.primMapListFixed BP.word8 (replicate (negate n) W8.DIGIT_0) <> BP.primMapListFixed BP.word8 ds | k == 1, [d] <- ds = Encoding $ B.word8 d <> B.word8 W8.LOWER_E <> B.word8 (if (n - 1) >= 0 then W8.PLUS else W8.HYPHEN) <> BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) | (d:ds') <- ds = Encoding $ B.word8 d <> B.word8 W8.PERIOD <> BP.primMapListFixed BP.word8 ds' <> B.word8 W8.LOWER_E <> B.word8 (if (n - 1) >= 0 then W8.PLUS else W8.HYPHEN) <> BP.primMapListFixed BP.word8 (integerToDecimalDigits (abs (toInteger n - 1))) | otherwise = string "0" -- shouldn't happen, but we need a default case. where -- 5. Otherwise, let n, k, and s be integers such that -- k ≥ 1, 10k - 1 ≤ s < 10k, the Number value for s × 10n - k is m, -- and k is as small as possible. -- Note that k is the number of digits in the decimal representation of s, -- that s is not divisible by 10, and that the least significant digit of s -- is not necessarily uniquely determined by these criteria. (n, k, s) = nks m ds = integerToDecimalDigits s -- 5. Otherwise, let n, k, and s be integers such that k ≥ 1, 10^(k - 1) ≤ s < 10^k, -- the Number value for s × 10^(n - k) is m, and k is as small as possible. -- Note that k is the number of digits in the decimal representation of s, -- that s is not divisible by 10, and that the least significant digit of s -- is not necessarily uniquely determined by these criteria. nks :: Scientific -> (Int, Int, Integer) nks m = (e + k, k, c) where m' = Sci.normalize m c = Sci.coefficient m' e = Sci.base10Exponent m' k = integerLog10 c + 1 integerToDecimalDigits :: Integer -> [Word8] integerToDecimalDigits = go [] where go acc 0 = acc go acc i = case quotRemInteger i 10 of (# q, r #) -> go (d:acc) q where !d = fromIntegral r + W8.DIGIT_0