------------------------------------------------------------------------------
-- Copyright (c) 2016, Natacha Porté --
-- --
-- Permission to use, copy, modify, and distribute this software for any --
-- purpose with or without fee is hereby granted, provided that the above --
-- copyright notice and this permission notice appear in all copies. --
-- --
-- THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES --
-- WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF --
-- MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR --
-- ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES --
-- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN --
-- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF --
-- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. --
------------------------------------------------------------------------------
------------------------------------------------------------------------------
-- Command Line Interface for primitives in Natools.Smaz.Tools. --
------------------------------------------------------------------------------
with Ada.Characters.Latin_1;
with Ada.Command_Line;
with Ada.Containers.Indefinite_Doubly_Linked_Lists;
with Ada.Containers.Indefinite_Holders;
with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;
with Natools.Parallelism;
with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz;
with Natools.Smaz.Tools;
with Natools.Smaz_256;
with Natools.Smaz_Generic.Tools;
with Natools.Smaz_Tools;
with Natools.Smaz_Tools.GNAT;
with Natools.String_Escapes;
procedure Smaz is
function To_SEA (S : String) return Ada.Streams.Stream_Element_Array
renames Natools.S_Expressions.To_Atom;
package Tools_256 is new Natools.Smaz_256.Tools;
package Methods renames Natools.Smaz_Tools.Methods;
package Actions is
type Enum is
(Nothing,
Decode,
Encode,
Evaluate);
end Actions;
package Algorithms is
type Enum is
(Base_256,
Base_256_Retired);
end Algorithms;
package Dict_Sources is
type Enum is
(S_Expression,
Text_List,
Unoptimized_Text_List);
end Dict_Sources;
package Options is
type Id is
(Base_256,
Output_Ada_Dict,
Dictionary_Input,
Decode,
Encode,
Evaluate,
Filter_Threshold,
Output_Hash,
Job_Count,
Help,
Sx_Dict_Output,
Min_Sub_Size,
Max_Sub_Size,
Dict_Size,
Max_Pending,
Base_256_Retired,
Stat_Output,
No_Stat_Output,
Text_List_Input,
Fast_Text_Input,
Max_Word_Size,
Sx_Output,
No_Sx_Output,
No_Vlen_Verbatim,
Score_Method,
Vlen_Verbatim);
end Options;
package Getopt is new Natools.Getopt_Long (Options.Id);
type Callback is new Getopt.Handlers.Callback with record
Algorithm : Algorithms.Enum := Algorithms.Base_256;
Display_Help : Boolean := False;
Need_Dictionary : Boolean := False;
Stat_Output : Boolean := False;
Sx_Output : Boolean := False;
Sx_Dict_Output : Boolean := False;
Min_Sub_Size : Positive := 1;
Max_Sub_Size : Positive := 3;
Max_Word_Size : Positive := 10;
Dict_Size : Positive := 254;
Vlen_Verbatim : Boolean := True;
Max_Pending : Ada.Containers.Count_Type
:= Ada.Containers.Count_Type'Last;
Job_Count : Natural := 0;
Filter_Threshold : Natools.Smaz_Tools.String_Count := 0;
Score_Method : Methods.Enum := Methods.Encoded;
Action : Actions.Enum := Actions.Nothing;
Ada_Dictionary : Ada.Strings.Unbounded.Unbounded_String;
Hash_Package : Ada.Strings.Unbounded.Unbounded_String;
Dict_Source : Dict_Sources.Enum := Dict_Sources.S_Expression;
end record;
overriding procedure Option
(Handler : in out Callback;
Id : in Options.Id;
Argument : in String);
overriding procedure Argument
(Handler : in out Callback;
Argument : in String)
is null;
function Activate_Dictionary (Dict : in Natools.Smaz_256.Dictionary)
return Natools.Smaz_256.Dictionary;
function Activate_Dictionary (Dict : in Natools.Smaz.Dictionary)
return Natools.Smaz.Dictionary;
-- Update Dictionary.Hash so that it can be actually used
procedure Build_Perfect_Hash
(Word_List : in Natools.Smaz.Tools.String_Lists.List;
Package_Name : in String);
-- Adapter between Smaz_256 generator and retired Smaz types
procedure Convert
(Input : in Natools.Smaz_Tools.String_Lists.List;
Output : out Natools.Smaz.Tools.String_Lists.List);
-- Convert between old and new string lists
function Getopt_Config return Getopt.Configuration;
-- Build the configuration object
function Last_Code (Dict : in Natools.Smaz_256.Dictionary)
return Ada.Streams.Stream_Element
is (Dict.Last_Code);
function Last_Code (Dict : in Natools.Smaz.Dictionary)
return Ada.Streams.Stream_Element
is (Dict.Dict_Last);
-- Return the last valid entry
procedure Print_Dictionary
(Output : in Ada.Text_IO.File_Type;
Dictionary : in Natools.Smaz_256.Dictionary;
Hash_Package_Name : in String := "");
procedure Print_Dictionary
(Output : in Ada.Text_IO.File_Type;
Dictionary : in Natools.Smaz.Dictionary;
Hash_Package_Name : in String := "");
-- print the given dictionary in the given file
procedure Print_Help
(Opt : in Getopt.Configuration;
Output : in Ada.Text_IO.File_Type);
-- Print the help text to the given file
generic
type Dictionary (<>) is private;
type Dictionary_Entry is (<>);
type Methods is (<>);
type Score_Value is range <>;
type String_Count is range <>;
type Word_Counter is private;
type Dictionary_Counts is array (Dictionary_Entry) of String_Count;
with package String_Lists
is new Ada.Containers.Indefinite_Doubly_Linked_Lists (String);
with function Activate_Dictionary (Dict : in Dictionary)
return Dictionary is <>;
with procedure Add_Substrings
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive);
with procedure Add_Words
(Counter : in out Word_Counter;
Phrase : in String;
Min_Size : in Positive;
Max_Size : in Positive);
with function Append_String
(Dict : in Dictionary;
Element : in String)
return Dictionary;
with procedure Build_Perfect_Hash
(Word_List : in String_Lists.List;
Package_Name : in String);
with function Compress
(Dict : in Dictionary;
Input : in String)
return Ada.Streams.Stream_Element_Array;
with function Decompress
(Dict : in Dictionary;
Input : in Ada.Streams.Stream_Element_Array)
return String;
with function Dict_Entry
(Dict : in Dictionary;
Element : in Dictionary_Entry)
return String;
with procedure Evaluate_Dictionary
(Dict : in Dictionary;
Corpus : in String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Dictionary_Counts);
with procedure Evaluate_Dictionary_Partial
(Dict : in Dictionary;
Corpus_Entry : in String;
Compressed_Size : in out Ada.Streams.Stream_Element_Count;
Counts : in out Dictionary_Counts);
with procedure Filter_By_Count
(Counter : in out Word_Counter;
Threshold_Count : in String_Count);
with function Last_Code (Dict : in Dictionary) return Dictionary_Entry;
with procedure Print_Dictionary
(Output : in Ada.Text_IO.File_Type;
Dict : in Dictionary;
Hash_Package_Name : in String := "")
is <>;
with function Remove_Element
(Dict : in Dictionary;
Element : in Dictionary_Entry)
return Dictionary;
Score_Encoded, Score_Frequency, Score_Gain : in access function
(D : in Dictionary;
C : in Dictionary_Counts;
E : in Dictionary_Entry)
return Score_Value;
with function Simple_Dictionary
(Counter : in Word_Counter;
Word_Count : in Natural;
Method : in Methods)
return String_Lists.List;
with procedure Simple_Dictionary_And_Pending
(Counter : in Word_Counter;
Word_Count : in Natural;
Selected : out String_Lists.List;
Pending : out String_Lists.List;
Method : in Methods;
Max_Pending_Count : in Ada.Containers.Count_Type);
with function To_Dictionary
(List : in String_Lists.List;
Variable_Length_Verbatim : in Boolean)
return Dictionary;
with function Worst_Element
(Dict : in Dictionary;
Counts : in Dictionary_Counts;
Method : in Methods)
return Dictionary_Entry;
package Dictionary_Subprograms is
package Holders is new Ada.Containers.Indefinite_Holders (Dictionary);
procedure Evaluate_Dictionary
(Job_Count : in Natural;
Dict : in Dictionary;
Corpus : in String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Dictionary_Counts);
-- Dispatch to parallel or non-parallel version of
-- Evaluate_Dictionary depending on Job_Count.
function Image
(Dict : in Dictionary;
Code : in Dictionary_Entry)
return Natools.S_Expressions.Atom;
-- S-expression image of Code
procedure Optimization_Round
(Dict : in out Holders.Holder;
Score : in out Ada.Streams.Stream_Element_Count;
Counts : in out Dictionary_Counts;
Pending_Words : in out String_Lists.List;
Input_Texts : in String_Lists.List;
Job_Count : in Natural;
Method : in Methods;
Updated : out Boolean);
-- Try to improve on Dict by replacing a single entry from it with
-- one of the substring in Pending_Words.
function Optimize_Dictionary
(Base : in Dictionary;
Pending_Words : in String_Lists.List;
Input_Texts : in String_Lists.List;
Job_Count : in Natural;
Method : in Methods)
return Dictionary;
-- Optimize the dictionary on Input_Texts, starting with Base and
-- adding substrings from Pending_Words.
procedure Parallel_Evaluate_Dictionary
(Job_Count : in Positive;
Dict : in Dictionary;
Corpus : in String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Dictionary_Counts);
-- Return the same results as Natools.Smaz.Tools.Evaluate_Dictionary,
-- but hopefully more quickly, using Job_Count tasks.
procedure Print_Dictionary
(Filename : in String;
Dict : in Dictionary;
Hash_Package_Name : in String := "");
-- print the given dictionary in the given file
procedure Process
(Handler : in Callback'Class;
Word_List : in String_Lists.List;
Data_List : in String_Lists.List;
Method : in Methods);
-- Perform the requested operations
function To_Dictionary
(Handler : in Callback'Class;
Input : in String_Lists.List;
Method : in Methods)
return Dictionary;
-- Convert the input into a dictionary given the option in Handler
end Dictionary_Subprograms;
package body Dictionary_Subprograms is
procedure Evaluate_Dictionary
(Job_Count : in Natural;
Dict : in Dictionary;
Corpus : in String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Dictionary_Counts)
is
Actual_Dict : constant Dictionary := Activate_Dictionary (Dict);
begin
if Job_Count > 0 then
Parallel_Evaluate_Dictionary (Job_Count,
Actual_Dict, Corpus, Compressed_Size, Counts);
else
Evaluate_Dictionary
(Actual_Dict, Corpus, Compressed_Size, Counts);
end if;
end Evaluate_Dictionary;
function Image
(Dict : in Dictionary;
Code : in Dictionary_Entry)
return Natools.S_Expressions.Atom is
begin
return Compress (Dict, Dict_Entry (Dict, Code));
end Image;
procedure Optimization_Round
(Dict : in out Holders.Holder;
Score : in out Ada.Streams.Stream_Element_Count;
Counts : in out Dictionary_Counts;
Pending_Words : in out String_Lists.List;
Input_Texts : in String_Lists.List;
Job_Count : in Natural;
Method : in Methods;
Updated : out Boolean)
is
use type Ada.Streams.Stream_Element_Offset;
New_Value : Ada.Strings.Unbounded.Unbounded_String;
New_Position : String_Lists.Cursor;
Worst_Index : constant Dictionary_Entry
:= Worst_Element (Dict.Element, Counts, Method);
Worst_Value : constant String
:= Dict_Entry (Dict.Element, Worst_Index);
Worst_Count : constant String_Count := Counts (Worst_Index);
Base : constant Dictionary
:= Remove_Element (Dict.Element, Worst_Index);
Old_Score : constant Ada.Streams.Stream_Element_Count := Score;
begin
Updated := False;
for Position in Pending_Words.Iterate loop
declare
Word : constant String := String_Lists.Element (Position);
New_Dict : constant Dictionary := Append_String (Base, Word);
New_Score : Ada.Streams.Stream_Element_Count;
New_Counts : Dictionary_Counts;
begin
Evaluate_Dictionary
(Job_Count, New_Dict, Input_Texts, New_Score, New_Counts);
if New_Score < Score then
Dict := Holders.To_Holder (New_Dict);
Score := New_Score;
Counts := New_Counts;
New_Value
:= Ada.Strings.Unbounded.To_Unbounded_String (Word);
New_Position := Position;
Updated := True;
end if;
end;
end loop;
if Updated then
Pending_Words.Delete (New_Position);
Pending_Words.Append (Worst_Value);
Ada.Text_IO.Put_Line
(Ada.Text_IO.Current_Error,
"Removing"
& Worst_Count'Img & "x "
& Natools.String_Escapes.C_Escape_Hex (Worst_Value, True)
& ", adding"
& Counts (Last_Code (Dict.Element))'Img & "x "
& Natools.String_Escapes.C_Escape_Hex
(Ada.Strings.Unbounded.To_String (New_Value), True)
& ", size"
& Score'Img
& " ("
& Ada.Streams.Stream_Element_Offset'Image (Score - Old_Score)
& ')');
end if;
end Optimization_Round;
function Optimize_Dictionary
(Base : in Dictionary;
Pending_Words : in String_Lists.List;
Input_Texts : in String_Lists.List;
Job_Count : in Natural;
Method : in Methods)
return Dictionary
is
Holder : Holders.Holder := Holders.To_Holder (Base);
Pending : String_Lists.List := Pending_Words;
Score : Ada.Streams.Stream_Element_Count;
Counts : Dictionary_Counts;
Running : Boolean := True;
begin
Evaluate_Dictionary
(Job_Count, Base, Input_Texts, Score, Counts);
while Running loop
Optimization_Round
(Holder,
Score,
Counts,
Pending,
Input_Texts,
Job_Count,
Method,
Running);
end loop;
return Holder.Element;
end Optimize_Dictionary;
procedure Parallel_Evaluate_Dictionary
(Job_Count : in Positive;
Dict : in Dictionary;
Corpus : in String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Dictionary_Counts)
is
type Result_Values is record
Compressed_Size : Ada.Streams.Stream_Element_Count;
Counts : Dictionary_Counts;
end record;
procedure Initialize (Result : in out Result_Values);
procedure Get_Next_Job
(Global : in out String_Lists.Cursor;
Job : out String_Lists.Cursor;
Terminated : out Boolean);
procedure Do_Job
(Result : in out Result_Values;
Job : in String_Lists.Cursor);
procedure Gather_Result
(Global : in out String_Lists.Cursor;
Partial : in Result_Values);
procedure Initialize (Result : in out Result_Values) is
begin
Result := (Compressed_Size => 0,
Counts => (others => 0));
end Initialize;
procedure Get_Next_Job
(Global : in out String_Lists.Cursor;
Job : out String_Lists.Cursor;
Terminated : out Boolean) is
begin
Job := Global;
Terminated := not String_Lists.Has_Element (Global);
if not Terminated then
String_Lists.Next (Global);
end if;
end Get_Next_Job;
procedure Do_Job
(Result : in out Result_Values;
Job : in String_Lists.Cursor) is
begin
Evaluate_Dictionary_Partial
(Dict,
String_Lists.Element (Job),
Result.Compressed_Size,
Result.Counts);
end Do_Job;
procedure Gather_Result
(Global : in out String_Lists.Cursor;
Partial : in Result_Values)
is
pragma Unreferenced (Global);
use type Ada.Streams.Stream_Element_Count;
use type Natools.Smaz_Tools.String_Count;
begin
Compressed_Size := Compressed_Size + Partial.Compressed_Size;
for I in Counts'Range loop
Counts (I) := Counts (I) + Partial.Counts (I);
end loop;
end Gather_Result;
procedure Parallel_Run
is new Natools.Parallelism.Per_Task_Accumulator_Run
(String_Lists.Cursor, Result_Values, String_Lists.Cursor);
Cursor : String_Lists.Cursor := String_Lists.First (Corpus);
begin
Compressed_Size := 0;
Counts := (others => 0);
Parallel_Run (Cursor, Job_Count);
end Parallel_Evaluate_Dictionary;
procedure Print_Dictionary
(Filename : in String;
Dict : in Dictionary;
Hash_Package_Name : in String := "") is
begin
if Filename = "-" then
Print_Dictionary
(Ada.Text_IO.Current_Output, Dict, Hash_Package_Name);
elsif Filename'Length > 0 then
declare
File : Ada.Text_IO.File_Type;
begin
Ada.Text_IO.Create (File, Name => Filename);
Print_Dictionary (File, Dict, Hash_Package_Name);
Ada.Text_IO.Close (File);
end;
end if;
end Print_Dictionary;
procedure Process
(Handler : in Callback'Class;
Word_List : in String_Lists.List;
Data_List : in String_Lists.List;
Method : in Methods)
is
Dict : constant Dictionary := Activate_Dictionary
(To_Dictionary (Handler, Word_List, Method));
Sx_Output : Natools.S_Expressions.Printers.Canonical
(Ada.Text_IO.Text_Streams.Stream (Ada.Text_IO.Current_Output));
Ada_Dictionary : constant String
:= Ada.Strings.Unbounded.To_String (Handler.Ada_Dictionary);
Hash_Package : constant String
:= Ada.Strings.Unbounded.To_String (Handler.Hash_Package);
begin
if Ada_Dictionary'Length > 0 then
Print_Dictionary (Ada_Dictionary, Dict, Hash_Package);
end if;
if Hash_Package'Length > 0 then
Build_Perfect_Hash (Word_List, Hash_Package);
end if;
if Handler.Sx_Dict_Output then
Sx_Output.Open_List;
for I in Dictionary_Entry'First .. Last_Code (Dict) loop
Sx_Output.Append_String (Dict_Entry (Dict, I));
end loop;
Sx_Output.Close_List;
end if;
case Handler.Action is
when Actions.Nothing => null;
when Actions.Decode =>
if Handler.Sx_Output then
Sx_Output.Open_List;
for S of Data_List loop
Sx_Output.Append_String (Decompress (Dict, To_SEA (S)));
end loop;
Sx_Output.Close_List;
end if;
if Handler.Stat_Output then
declare
procedure Print_Line (Original, Output : Natural);
procedure Print_Line (Original, Output : Natural) is
begin
Ada.Text_IO.Put_Line
(Natural'Image (Original)
& Ada.Characters.Latin_1.HT
& Natural'Image (Output)
& Ada.Characters.Latin_1.HT
& Float'Image (Float (Original) / Float (Output)));
end Print_Line;
Original_Total : Natural := 0;
Output_Total : Natural := 0;
begin
for S of Data_List loop
declare
Original_Size : constant Natural := S'Length;
Output_Size : constant Natural
:= Decompress (Dict, To_SEA (S))'Length;
begin
Print_Line (Original_Size, Output_Size);
Original_Total := Original_Total + Original_Size;
Output_Total := Output_Total + Output_Size;
end;
end loop;
Print_Line (Original_Total, Output_Total);
end;
end if;
when Actions.Encode =>
if Handler.Sx_Output then
Sx_Output.Open_List;
for S of Data_List loop
Sx_Output.Append_Atom (Compress (Dict, S));
end loop;
Sx_Output.Close_List;
end if;
if Handler.Stat_Output then
declare
procedure Print_Line (Original, Output, Base64 : Natural);
procedure Print_Line
(Original, Output, Base64 : in Natural) is
begin
Ada.Text_IO.Put_Line
(Natural'Image (Original)
& Ada.Characters.Latin_1.HT
& Natural'Image (Output)
& Ada.Characters.Latin_1.HT
& Natural'Image (Base64)
& Ada.Characters.Latin_1.HT
& Float'Image (Float (Output) / Float (Original))
& Ada.Characters.Latin_1.HT
& Float'Image (Float (Base64) / Float (Original)));
end Print_Line;
Original_Total : Natural := 0;
Output_Total : Natural := 0;
Base64_Total : Natural := 0;
begin
for S of Data_List loop
declare
Original_Size : constant Natural := S'Length;
Output_Size : constant Natural
:= Compress (Dict, S)'Length;
Base64_Size : constant Natural
:= ((Output_Size + 2) / 3) * 4;
begin
Print_Line
(Original_Size, Output_Size, Base64_Size);
Original_Total := Original_Total + Original_Size;
Output_Total := Output_Total + Output_Size;
Base64_Total := Base64_Total + Base64_Size;
end;
end loop;
Print_Line (Original_Total, Output_Total, Base64_Total);
end;
end if;
when Actions.Evaluate =>
declare
Total_Size : Ada.Streams.Stream_Element_Count;
Counts : Dictionary_Counts;
begin
Evaluate_Dictionary (Handler.Job_Count,
Dict, Data_List, Total_Size, Counts);
if Handler.Sx_Output then
Sx_Output.Open_List;
Sx_Output.Append_String (Ada.Strings.Fixed.Trim
(Ada.Streams.Stream_Element_Count'Image (Total_Size),
Ada.Strings.Both));
for E in Dictionary_Entry'First .. Last_Code (Dict) loop
Sx_Output.Open_List;
Sx_Output.Append_Atom (Image (Dict, E));
Sx_Output.Append_String (Dict_Entry (Dict, E));
Sx_Output.Append_String (Ada.Strings.Fixed.Trim
(String_Count'Image (Counts (E)),
Ada.Strings.Both));
Sx_Output.Close_List;
end loop;
Sx_Output.Close_List;
end if;
if Handler.Stat_Output then
declare
procedure Print
(Label : in String;
E : in Dictionary_Entry;
Score : in Score_Value);
procedure Print_Min_Max
(Label : in String;
Score : not null access function
(D : in Dictionary;
C : in Dictionary_Counts;
E : in Dictionary_Entry)
return Score_Value);
procedure Print_Value
(Label : in String;
Score : not null access function
(D : in Dictionary;
C : in Dictionary_Counts;
E : in Dictionary_Entry)
return Score_Value;
Ref : in Score_Value);
procedure Print
(Label : in String;
E : in Dictionary_Entry;
Score : in Score_Value) is
begin
if Handler.Sx_Output then
Sx_Output.Open_List;
Sx_Output.Append_Atom (Image (Dict, E));
Sx_Output.Append_String (Dict_Entry (Dict, E));
Sx_Output.Append_String (Ada.Strings.Fixed.Trim
(Score'Img, Ada.Strings.Both));
Sx_Output.Close_List;
else
Ada.Text_IO.Put_Line
(Label
& Ada.Characters.Latin_1.HT
& Dictionary_Entry'Image (E)
& Ada.Characters.Latin_1.HT
& Natools.String_Escapes.C_Escape_Hex
(Dict_Entry (Dict, E), True)
& Ada.Characters.Latin_1.HT
& Score'Img);
end if;
end Print;
procedure Print_Min_Max
(Label : in String;
Score : not null access function
(D : in Dictionary;
C : in Dictionary_Counts;
E : in Dictionary_Entry)
return Score_Value)
is
Min_Score, Max_Score : Score_Value
:= Score (Dict, Counts, Dictionary_Entry'First);
S : Score_Value;
begin
for E in Dictionary_Entry'Succ
(Dictionary_Entry'First)
.. Last_Code (Dict)
loop
S := Score (Dict, Counts, E);
if S < Min_Score then
Min_Score := S;
end if;
if S > Max_Score then
Max_Score := S;
end if;
end loop;
Print_Value ("best-" & Label, Score, Max_Score);
Print_Value ("worst-" & Label, Score, Min_Score);
end Print_Min_Max;
procedure Print_Value
(Label : in String;
Score : not null access function
(D : in Dictionary;
C : in Dictionary_Counts;
E : in Dictionary_Entry)
return Score_Value;
Ref : in Score_Value) is
begin
if Handler.Sx_Output then
Sx_Output.Open_List;
Sx_Output.Append_String (Label);
end if;
for E in Dictionary_Entry'First .. Last_Code (Dict)
loop
if Score (Dict, Counts, E) = Ref then
Print (Label, E, Ref);
end if;
end loop;
if Handler.Sx_Output then
Sx_Output.Close_List;
end if;
end Print_Value;
begin
Print_Min_Max ("encoded", Score_Encoded);
Print_Min_Max ("frequency", Score_Frequency);
Print_Min_Max ("gain", Score_Gain);
end;
end if;
end;
end case;
end Process;
function To_Dictionary
(Handler : in Callback'Class;
Input : in String_Lists.List;
Method : in Methods)
return Dictionary
is
use type Natools.Smaz_Tools.String_Count;
use type Dict_Sources.Enum;
begin
case Handler.Dict_Source is
when Dict_Sources.S_Expression =>
return To_Dictionary (Input, Handler.Vlen_Verbatim);
when Dict_Sources.Text_List | Dict_Sources.Unoptimized_Text_List =>
declare
Counter : Word_Counter;
begin
for S of Input loop
Add_Substrings
(Counter, S,
Handler.Min_Sub_Size, Handler.Max_Sub_Size);
if Handler.Max_Word_Size > Handler.Max_Sub_Size then
Add_Words
(Counter, S,
Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
end if;
end loop;
if Handler.Filter_Threshold > 0 then
Filter_By_Count
(Counter, String_Count (Handler.Filter_Threshold));
end if;
if Handler.Dict_Source = Dict_Sources.Text_List then
declare
Selected, Pending : String_Lists.List;
begin
Simple_Dictionary_And_Pending
(Counter,
Handler.Dict_Size,
Selected,
Pending,
Method,
Handler.Max_Pending);
return Optimize_Dictionary
(To_Dictionary (Selected, Handler.Vlen_Verbatim),
Pending,
Input,
Handler.Job_Count,
Method);
end;
else
return To_Dictionary
(Simple_Dictionary (Counter, Handler.Dict_Size, Method),
Handler.Vlen_Verbatim);
end if;
end;
end case;
end To_Dictionary;
end Dictionary_Subprograms;
package Dict_256 is new Dictionary_Subprograms
(Dictionary => Natools.Smaz_256.Dictionary,
Dictionary_Entry => Ada.Streams.Stream_Element,
Methods => Natools.Smaz_Tools.Methods.Enum,
Score_Value => Natools.Smaz_Tools.Score_Value,
String_Count => Natools.Smaz_Tools.String_Count,
Word_Counter => Natools.Smaz_Tools.Word_Counter,
Dictionary_Counts => Tools_256.Dictionary_Counts,
String_Lists => Natools.Smaz_Tools.String_Lists,
Add_Substrings => Natools.Smaz_Tools.Add_Substrings,
Add_Words => Natools.Smaz_Tools.Add_Words,
Append_String => Tools_256.Append_String,
Build_Perfect_Hash => Natools.Smaz_Tools.GNAT.Build_Perfect_Hash,
Compress => Natools.Smaz_256.Compress,
Decompress => Natools.Smaz_256.Decompress,
Dict_Entry => Natools.Smaz_256.Dict_Entry,
Evaluate_Dictionary => Tools_256.Evaluate_Dictionary,
Evaluate_Dictionary_Partial => Tools_256.Evaluate_Dictionary_Partial,
Filter_By_Count => Natools.Smaz_Tools.Filter_By_Count,
Last_Code => Last_Code,
Remove_Element => Tools_256.Remove_Element,
Score_Encoded => Tools_256.Score_Encoded'Access,
Score_Frequency => Tools_256.Score_Frequency'Access,
Score_Gain => Tools_256.Score_Gain'Access,
Simple_Dictionary => Natools.Smaz_Tools.Simple_Dictionary,
Simple_Dictionary_And_Pending
=> Natools.Smaz_Tools.Simple_Dictionary_And_Pending,
To_Dictionary => Tools_256.To_Dictionary,
Worst_Element => Tools_256.Worst_Index);
package Dict_Retired is new Dictionary_Subprograms
(Dictionary => Natools.Smaz.Dictionary,
Dictionary_Entry => Ada.Streams.Stream_Element,
Methods => Natools.Smaz.Tools.Methods.Enum,
Score_Value => Natools.Smaz.Tools.Score_Value,
String_Count => Natools.Smaz.Tools.String_Count,
Word_Counter => Natools.Smaz.Tools.Word_Counter,
Dictionary_Counts => Natools.Smaz.Tools.Dictionary_Counts,
String_Lists => Natools.Smaz.Tools.String_Lists,
Add_Substrings => Natools.Smaz.Tools.Add_Substrings,
Add_Words => Natools.Smaz.Tools.Add_Words,
Append_String => Natools.Smaz.Tools.Append_String,
Build_Perfect_Hash => Build_Perfect_Hash,
Compress => Natools.Smaz.Compress,
Decompress => Natools.Smaz.Decompress,
Dict_Entry => Natools.Smaz.Dict_Entry,
Evaluate_Dictionary => Natools.Smaz.Tools.Evaluate_Dictionary,
Evaluate_Dictionary_Partial
=> Natools.Smaz.Tools.Evaluate_Dictionary_Partial,
Filter_By_Count => Natools.Smaz.Tools.Filter_By_Count,
Last_Code => Last_Code,
Remove_Element => Natools.Smaz.Tools.Remove_Element,
Score_Encoded => Natools.Smaz.Tools.Score_Encoded'Access,
Score_Frequency => Natools.Smaz.Tools.Score_Frequency'Access,
Score_Gain => Natools.Smaz.Tools.Score_Gain'Access,
Simple_Dictionary => Natools.Smaz.Tools.Simple_Dictionary,
Simple_Dictionary_And_Pending
=> Natools.Smaz.Tools.Simple_Dictionary_And_Pending,
To_Dictionary => Natools.Smaz.Tools.To_Dictionary,
Worst_Element => Natools.Smaz.Tools.Worst_Index);
overriding procedure Option
(Handler : in out Callback;
Id : in Options.Id;
Argument : in String) is
begin
case Id is
when Options.Help =>
Handler.Display_Help := True;
when Options.Decode =>
Handler.Need_Dictionary := True;
Handler.Action := Actions.Decode;
when Options.Encode =>
Handler.Need_Dictionary := True;
Handler.Action := Actions.Encode;
when Options.Evaluate =>
Handler.Need_Dictionary := True;
Handler.Action := Actions.Evaluate;
when Options.No_Stat_Output =>
Handler.Stat_Output := False;
when Options.No_Sx_Output =>
Handler.Sx_Output := False;
when Options.Output_Ada_Dict =>
Handler.Need_Dictionary := True;
if Argument'Length > 0 then
Handler.Ada_Dictionary
:= Ada.Strings.Unbounded.To_Unbounded_String (Argument);
else
Handler.Ada_Dictionary
:= Ada.Strings.Unbounded.To_Unbounded_String ("-");
end if;
when Options.Output_Hash =>
Handler.Need_Dictionary := True;
Handler.Hash_Package
:= Ada.Strings.Unbounded.To_Unbounded_String (Argument);
when Options.Stat_Output =>
Handler.Stat_Output := True;
when Options.Sx_Output =>
Handler.Sx_Output := True;
when Options.Dictionary_Input =>
Handler.Dict_Source := Dict_Sources.S_Expression;
when Options.Text_List_Input =>
Handler.Dict_Source := Dict_Sources.Text_List;
when Options.Fast_Text_Input =>
Handler.Dict_Source := Dict_Sources.Unoptimized_Text_List;
when Options.Sx_Dict_Output =>
Handler.Need_Dictionary := True;
Handler.Sx_Dict_Output := True;
when Options.Min_Sub_Size =>
Handler.Min_Sub_Size := Positive'Value (Argument);
when Options.Max_Sub_Size =>
Handler.Max_Sub_Size := Positive'Value (Argument);
when Options.Max_Word_Size =>
Handler.Max_Word_Size := Positive'Value (Argument);
when Options.Job_Count =>
Handler.Job_Count := Natural'Value (Argument);
when Options.Filter_Threshold =>
Handler.Filter_Threshold
:= Natools.Smaz_Tools.String_Count'Value (Argument);
when Options.Score_Method =>
Handler.Score_Method := Methods.Enum'Value (Argument);
when Options.Max_Pending =>
Handler.Max_Pending := Ada.Containers.Count_Type'Value (Argument);
when Options.Dict_Size =>
Handler.Dict_Size := Positive'Value (Argument);
when Options.Vlen_Verbatim =>
Handler.Vlen_Verbatim := True;
when Options.No_Vlen_Verbatim =>
Handler.Vlen_Verbatim := False;
when Options.Base_256 =>
Handler.Algorithm := Algorithms.Base_256;
when Options.Base_256_Retired =>
Handler.Algorithm := Algorithms.Base_256_Retired;
end case;
end Option;
function Activate_Dictionary (Dict : in Natools.Smaz_256.Dictionary)
return Natools.Smaz_256.Dictionary
is
Result : Natools.Smaz_256.Dictionary := Dict;
begin
Natools.Smaz_Tools.Set_Dictionary_For_Trie_Search
(Tools_256.To_String_List (Result));
Result.Hash := Natools.Smaz_Tools.Trie_Search'Access;
for I in Result.Offsets'Range loop
if Natools.Smaz_Tools.Trie_Search (Natools.Smaz_256.Dict_Entry
(Result, I)) /= Natural (I)
then
Ada.Text_IO.Put_Line
(Ada.Text_IO.Current_Error,
"Fail at" & Ada.Streams.Stream_Element'Image (I)
& " -> " & Natools.String_Escapes.C_Escape_Hex
(Natools.Smaz_256.Dict_Entry (Result, I), True)
& " ->" & Natural'Image (Natools.Smaz_Tools.Trie_Search
(Natools.Smaz_256.Dict_Entry (Result, I))));
end if;
end loop;
return Result;
end Activate_Dictionary;
function Activate_Dictionary (Dict : in Natools.Smaz.Dictionary)
return Natools.Smaz.Dictionary
is
Result : Natools.Smaz.Dictionary := Dict;
begin
Natools.Smaz.Tools.Set_Dictionary_For_Trie_Search (Result);
Result.Hash := Natools.Smaz.Tools.Trie_Search'Access;
for I in Result.Offsets'Range loop
if Natools.Smaz.Tools.Trie_Search (Natools.Smaz.Dict_Entry
(Result, I)) /= Natural (I)
then
Ada.Text_IO.Put_Line
(Ada.Text_IO.Current_Error,
"Fail at" & Ada.Streams.Stream_Element'Image (I)
& " -> " & Natools.String_Escapes.C_Escape_Hex
(Natools.Smaz.Dict_Entry (Result, I), True)
& " ->" & Natural'Image (Natools.Smaz.Tools.Trie_Search
(Natools.Smaz.Dict_Entry (Result, I))));
end if;
end loop;
return Result;
end Activate_Dictionary;
procedure Build_Perfect_Hash
(Word_List : in Natools.Smaz.Tools.String_Lists.List;
Package_Name : in String)
is
Other_Word_List : Natools.Smaz_Tools.String_Lists.List;
begin
for S of Word_List loop
Natools.Smaz_Tools.String_Lists.Append (Other_Word_List, S);
end loop;
Natools.Smaz_Tools.GNAT.Build_Perfect_Hash
(Other_Word_List, Package_Name);
end Build_Perfect_Hash;
procedure Convert
(Input : in Natools.Smaz_Tools.String_Lists.List;
Output : out Natools.Smaz.Tools.String_Lists.List) is
begin
Natools.Smaz.Tools.String_Lists.Clear (Output);
for S of Input loop
Natools.Smaz.Tools.String_Lists.Append (Output, S);
end loop;
end Convert;
function Getopt_Config return Getopt.Configuration is
use Getopt;
use Options;
R : Getopt.Configuration;
begin
R.Add_Option ("base-256", '2', No_Argument, Base_256);
R.Add_Option ("ada-dict", 'A', Optional_Argument, Output_Ada_Dict);
R.Add_Option ("decode", 'd', No_Argument, Decode);
R.Add_Option ("dict", 'D', No_Argument, Dictionary_Input);
R.Add_Option ("encode", 'e', No_Argument, Encode);
R.Add_Option ("evaluate", 'E', No_Argument, Evaluate);
R.Add_Option ("filter", 'F', Required_Argument, Filter_Threshold);
R.Add_Option ("help", 'h', No_Argument, Help);
R.Add_Option ("hash-pkg", 'H', Required_Argument, Output_Hash);
R.Add_Option ("jobs", 'j', Required_Argument, Job_Count);
R.Add_Option ("sx-dict", 'L', No_Argument, Sx_Dict_Output);
R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
R.Add_Option ("dict-size", 'n', Required_Argument, Dict_Size);
R.Add_Option ("max-pending", 'N', Required_Argument, Max_Pending);
R.Add_Option ("retired", 'R', No_Argument, Base_256_Retired);
R.Add_Option ("stats", 's', No_Argument, Stat_Output);
R.Add_Option ("no-stats", 'S', No_Argument, No_Stat_Output);
R.Add_Option ("text-list", 't', No_Argument, Text_List_Input);
R.Add_Option ("fast-text-list", 'T', No_Argument, Fast_Text_Input);
R.Add_Option ("max-word-len", 'W', Required_Argument, Max_Word_Size);
R.Add_Option ("s-expr", 'x', No_Argument, Sx_Output);
R.Add_Option ("no-s-expr", 'X', No_Argument, No_Sx_Output);
R.Add_Option ("no-vlen-verbatim", No_Argument, No_Vlen_Verbatim);
R.Add_Option ("score-method", Required_Argument, Score_Method);
R.Add_Option ("vlen-verbatim", No_Argument, Vlen_Verbatim);
return R;
end Getopt_Config;
procedure Print_Dictionary
(Output : in Ada.Text_IO.File_Type;
Dictionary : in Natools.Smaz_256.Dictionary;
Hash_Package_Name : in String := "")
is
procedure Put_Line (Line : in String);
procedure Put_Line (Line : in String) is
begin
Ada.Text_IO.Put_Line (Output, Line);
end Put_Line;
procedure Print_Dictionary_In_Ada is
new Tools_256.Print_Dictionary_In_Ada (Put_Line);
begin
if Hash_Package_Name'Length > 0 then
Print_Dictionary_In_Ada
(Dictionary,
Hash_Image => Hash_Package_Name & ".Hash'Access");
else
Print_Dictionary_In_Ada (Dictionary);
end if;
end Print_Dictionary;
procedure Print_Dictionary
(Output : in Ada.Text_IO.File_Type;
Dictionary : in Natools.Smaz.Dictionary;
Hash_Package_Name : in String := "")
is
procedure Put_Line (Line : in String);
procedure Put_Line (Line : in String) is
begin
Ada.Text_IO.Put_Line (Output, Line);
end Put_Line;
procedure Print_Dictionary_In_Ada is
new Natools.Smaz.Tools.Print_Dictionary_In_Ada (Put_Line);
begin
if Hash_Package_Name'Length > 0 then
Print_Dictionary_In_Ada
(Dictionary,
Hash_Image => Hash_Package_Name & ".Hash'Access");
else
Print_Dictionary_In_Ada (Dictionary);
end if;
end Print_Dictionary;
procedure Print_Help
(Opt : in Getopt.Configuration;
Output : in Ada.Text_IO.File_Type)
is
use Ada.Text_IO;
Indent : constant String := " ";
begin
Put_Line (Output, "Usage:");
for Id in Options.Id loop
Put (Output, Indent & Opt.Format_Names (Id));
case Id is
when Options.Help =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Display this help text");
when Options.Decode =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Read a list of strings and decode them");
when Options.Encode =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Read a list of strings and encode them");
when Options.No_Stat_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Do not output filter statistics");
when Options.No_Sx_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Do not output filtered results in a S-expression");
when Options.Output_Ada_Dict =>
Put_Line (Output, " [filename]");
Put_Line (Output, Indent & Indent
& "Output the current dictionary as Ada code in the given");
Put_Line (Output, Indent & Indent
& "file, or standard output if filename is empty or ""-""");
when Options.Output_Hash =>
Put_Line (Output, " <Hash_Package_Name>");
Put_Line (Output, Indent & Indent
& "Build a package with a perfect hash function for the");
Put_Line (Output, Indent & Indent
& "current dictionary.");
when Options.Stat_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Output filter statistics");
when Options.Sx_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Output filtered results in a S-expression");
when Options.Dictionary_Input =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Read dictionary directly in input S-expression (default)");
when Options.Text_List_Input =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Compute dictionary from sample texts"
& " in input S-expression");
when Options.Fast_Text_Input =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Compute dictionary from sample texts"
& " in input S-expression, without optimization");
when Options.Sx_Dict_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Output the dictionary as a S-expression");
when Options.Min_Sub_Size =>
Put_Line (Output, " <length>");
Put_Line (Output, Indent & Indent
& "Minimum substring size when building a dictionary");
when Options.Max_Sub_Size =>
Put_Line (Output, " <length>");
Put_Line (Output, Indent & Indent
& "Maximum substring size when building a dictionary");
when Options.Max_Word_Size =>
Put_Line (Output, " <length>");
Put_Line (Output, Indent & Indent
& "Maximum word size when building a dictionary");
when Options.Evaluate =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Evaluate the dictionary on the input given corpus");
when Options.Job_Count =>
Put_Line (Output, " <number>");
Put_Line (Output, Indent & Indent
& "Number of parallel jobs in long calculations");
when Options.Filter_Threshold =>
Put_Line (Output, " <threshold>");
Put_Line (Output, Indent & Indent
& "Before building a dictionary from substrings, remove");
Put_Line (Output, Indent & Indent
& "substrings whose count is below the threshold.");
when Options.Score_Method =>
Put_Line (Output, " <method>");
Put_Line (Output, Indent & Indent
& "Select heuristic method to replace dictionary items"
& " during optimization");
when Options.Max_Pending =>
Put_Line (Output, " <count>");
Put_Line (Output, Indent & Indent
& "Maximum size of candidate list"
& " when building a dictionary");
when Options.Dict_Size =>
Put_Line (Output, " <count>");
Put_Line (Output, Indent & Indent
& "Number of words in the dictionary to build");
when Options.Vlen_Verbatim =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Enable variable-length verbatim in built dictionary");
when Options.No_Vlen_Verbatim =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Disable variable-length verbatim in built dictionary");
when Options.Base_256 =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Use base-256 implementation (default)");
when Options.Base_256_Retired =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Use retired base-256 implementation");
end case;
end loop;
end Print_Help;
Opt_Config : constant Getopt.Configuration := Getopt_Config;
Handler : Callback;
Input_List, Input_Data : Natools.Smaz_Tools.String_Lists.List;
begin
Process_Command_Line :
begin
Opt_Config.Process (Handler);
exception
when Getopt.Option_Error =>
Print_Help (Opt_Config, Ada.Text_IO.Current_Error);
Ada.Command_Line.Set_Exit_Status (Ada.Command_Line.Failure);
return;
end Process_Command_Line;
if Handler.Display_Help then
Print_Help (Opt_Config, Ada.Text_IO.Current_Output);
end if;
if not Handler.Need_Dictionary then
return;
end if;
if not (Handler.Stat_Output or Handler.Sx_Output) then
Handler.Sx_Output := True;
end if;
Read_Input_List :
declare
use type Actions.Enum;
Input : constant access Ada.Streams.Root_Stream_Type'Class
:= Ada.Text_IO.Text_Streams.Stream (Ada.Text_IO.Current_Input);
Parser : Natools.S_Expressions.Parsers.Stream_Parser (Input);
begin
Parser.Next;
Natools.Smaz_Tools.Read_List (Input_List, Parser);
if Handler.Action /= Actions.Nothing then
Parser.Next;
Natools.Smaz_Tools.Read_List (Input_Data, Parser);
end if;
end Read_Input_List;
case Handler.Algorithm is
when Algorithms.Base_256 =>
Dict_256.Process
(Handler, Input_List, Input_Data, Handler.Score_Method);
when Algorithms.Base_256_Retired =>
declare
Converted_Input_List : Natools.Smaz.Tools.String_Lists.List;
Converted_Input_Data : Natools.Smaz.Tools.String_Lists.List;
begin
Convert (Input_List, Converted_Input_List);
Convert (Input_Data, Converted_Input_Data);
Dict_Retired.Process
(Handler, Converted_Input_List, Converted_Input_Data,
Natools.Smaz.Tools.Methods.Enum'Val
(Natools.Smaz_Tools.Methods.Enum'Pos (Handler.Score_Method)));
end;
end case;
end Smaz;