Natools

Diff
Login

Differences From Artifact [ddb34c3613]:

To Artifact [083f639e5f]:


10
11
12
13
14
15
16






























17
18
19
20
21
22
23
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53







+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+







-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR  --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES   --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN    --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF  --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.           --
------------------------------------------------------------------------------

------------------------------------------------------------------------------
-- Natools.Smaz is a re-implementation of the short string compression      --
-- algorithm "Smaz" by Salvatore Sanfilippo                                 --
-- (see https://github.com/antirez/smaz).                                   --
-- Its main selling point is its simplicity and CPU performance. However    --
-- the implementation here emphasizes correctness (which greatly benefits   --
-- from simplicity) over performance (so no benchmarks have been made).     --
--                                                                          --
-- The basic idea behind the algorithm is that bytes in the encoded (and    --
-- hopefully compressed) message are indexes in a static compiled-in        --
-- dictionary, and two special byte values to mark verbatim data.           --
--                                                                          --
-- For example, using original Smaz dictionary, the string "Athe33" is      --
-- encoded as (254, 65, 1, 255, 1, 51, 51), which can be broken down as:    --
--   * 254 to mark the following byte as verbatim                           --
--   * 65 which is verbatim byte for 'A'                                    --
--   * 1 to mark the second word in the dictionary: "the"                   --
--   * 255 to mark variable-length verbatim escape                          --
--   * 1 to encoding the length of the verbatim fragment: 2 bytes           --
--   * 51, 51 the verbatim bytes for "33".                                  --
--                                                                          --
-- Note that the encoder has been improved over the original Smaz encoder,  --
-- in that it merges adjacent verbatim fragments when it makes the output   --
-- smaller. For example, with the input 5-byte string "33 33", the original --
-- naive encoder would produce the 9-byte output                            --
-- (255, 1, 51, 51, 0, 255, 1, 51, 51), while encoder here would encode the --
-- whole string in a single verbatim fragment, leading to the 7-byte output --
-- (255, 4, 51, 51, 32, 51, 51).                                            --
------------------------------------------------------------------------------

with Ada.Streams;

package Natools.Smaz is
   pragma Pure (Natools.Smaz);

   use type Ada.Streams.Stream_Element;

43
44
45
46
47
48
49

50
51
52
53
54
55

56
57
58

59
60
61
62
63
64

65
66
67
68
69
70

71
72
73
74

75
76
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112







+






+



+






+






+




+


                  in 1 .. Dictionary.Max_Word_Length));


   function Compressed_Upper_Bound
     (Dict : in Dictionary;
      Input : in String)
     return Ada.Streams.Stream_Element_Count;
      --  Return the maximum number of bytes needed to encode Input

   procedure Compress
     (Dict : in Dictionary;
      Input : in String;
      Output_Buffer : out Ada.Streams.Stream_Element_Array;
      Output_Last : out Ada.Streams.Stream_Element_Offset);
      --  Encode Input into Output_Buffer

   function Compress (Dict : in Dictionary; Input : in String)
     return Ada.Streams.Stream_Element_Array;
      --  Return an encoded buffer for Input


   function Decompressed_Length
     (Dict : in Dictionary;
      Input : in Ada.Streams.Stream_Element_Array)
     return Natural;
      --  Return the exact length when Input is decoded

   procedure Decompress
     (Dict : in Dictionary;
      Input : in Ada.Streams.Stream_Element_Array;
      Output_Buffer : out String;
      Output_Last : out Natural);
      --  Decode Input into Output_Buffer

   function Decompress
     (Dict : in Dictionary; Input : in Ada.Streams.Stream_Element_Array)
     return String;
      --  Return a decoded buffer for Input

end Natools.Smaz;