Index: tools/smaz.adb ================================================================== --- tools/smaz.adb +++ tools/smaz.adb @@ -18,10 +18,11 @@ -- Command Line Interface for primitives in Natools.Smaz.Tools. -- ------------------------------------------------------------------------------ with Ada.Characters.Latin_1; with Ada.Command_Line; +with Ada.Containers.Indefinite_Holders; with Ada.Streams; with Ada.Strings.Fixed; with Ada.Strings.Unbounded; with Ada.Text_IO.Text_Streams; with Natools.Getopt_Long; @@ -34,10 +35,13 @@ procedure Smaz is function To_SEA (S : String) return Ada.Streams.Stream_Element_Array renames Natools.S_Expressions.To_Atom; + package Holders is new Ada.Containers.Indefinite_Holders + (Natools.Smaz.Dictionary, Natools.Smaz."="); + package Actions is type Enum is (Nothing, Decode, Encode, @@ -45,11 +49,12 @@ end Actions; package Dict_Sources is type Enum is (S_Expression, - Text_List); + Text_List, + Unoptimized_Text_List); end Dict_Sources; package Options is type Id is (Output_Ada_Dict, @@ -65,10 +70,11 @@ Min_Sub_Size, Max_Sub_Size, Stat_Output, No_Stat_Output, Text_List_Input, + Fast_Text_Input, Max_Word_Size, Sx_Output, No_Sx_Output); end Options; @@ -112,10 +118,30 @@ -- depending on Job_Count. function Getopt_Config return Getopt.Configuration; -- Build the configuration object + procedure Optimization_Round + (Dict : in out Holders.Holder; + Score : in out Ada.Streams.Stream_Element_Count; + Counts : in out Natools.Smaz.Tools.Dictionary_Counts; + Pending_Words : in out Natools.Smaz.Tools.String_Lists.List; + Input_Texts : in Natools.Smaz.Tools.String_Lists.List; + Job_Count : in Natural; + Updated : out Boolean); + -- Try to improve on Dict by replacing a single entry from it with + -- one of the substrings in Pending_Words. 
+ + function Optimize_Dictionary + (Base : in Natools.Smaz.Dictionary; + Pending_Words : in Natools.Smaz.Tools.String_Lists.List; + Input_Texts : in Natools.Smaz.Tools.String_Lists.List; + Job_Count : in Natural) + return Natools.Smaz.Dictionary; + -- Optimize the dictionary on Input_Texts, starting with Base and + -- adding substrings from Pending_Words. + procedure Parallel_Evaluate_Dictionary (Job_Count : in Positive; Dict : in Natools.Smaz.Dictionary; Corpus : in Natools.Smaz.Tools.String_Lists.List; Compressed_Size : out Ada.Streams.Stream_Element_Count; @@ -198,10 +224,13 @@ Handler.Dict_Source := Dict_Sources.S_Expression; when Options.Text_List_Input => Handler.Dict_Source := Dict_Sources.Text_List; + when Options.Fast_Text_Input => + Handler.Dict_Source := Dict_Sources.Unoptimized_Text_List; + when Options.Sx_Dict_Output => Handler.Need_Dictionary := True; Handler.Sx_Dict_Output := True; when Options.Min_Sub_Size => @@ -277,17 +306,68 @@ R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size); R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size); R.Add_Option ("stats", 's', No_Argument, Stat_Output); R.Add_Option ("no-stats", 'S', No_Argument, No_Stat_Output); R.Add_Option ("text-list", 't', No_Argument, Text_List_Input); + R.Add_Option ("fast-text-list", 'T', No_Argument, Fast_Text_Input); R.Add_Option ("max-word-len", 'W', Required_Argument, Max_Word_Size); R.Add_Option ("s-expr", 'x', No_Argument, Sx_Output); R.Add_Option ("no-s-expr", 'X', No_Argument, No_Sx_Output); return R; end Getopt_Config; + + procedure Optimization_Round + (Dict : in out Holders.Holder; + Score : in out Ada.Streams.Stream_Element_Count; + Counts : in out Natools.Smaz.Tools.Dictionary_Counts; + Pending_Words : in out Natools.Smaz.Tools.String_Lists.List; + Input_Texts : in Natools.Smaz.Tools.String_Lists.List; + Job_Count : in Natural; + Updated : out Boolean) + is + pragma Unreferenced (Dict); + pragma Unreferenced (Score); + pragma Unreferenced 
(Counts); + pragma Unreferenced (Pending_Words); + pragma Unreferenced (Input_Texts); + pragma Unreferenced (Job_Count); + begin + Updated := False; + end Optimization_Round; + + + function Optimize_Dictionary + (Base : in Natools.Smaz.Dictionary; + Pending_Words : in Natools.Smaz.Tools.String_Lists.List; + Input_Texts : in Natools.Smaz.Tools.String_Lists.List; + Job_Count : in Natural) + return Natools.Smaz.Dictionary + is + Holder : Holders.Holder := Holders.To_Holder (Base); + Pending : Natools.Smaz.Tools.String_Lists.List := Pending_Words; + Score : Ada.Streams.Stream_Element_Count; + Counts : Natools.Smaz.Tools.Dictionary_Counts; + Running : Boolean := True; + begin + Evaluate_Dictionary (Job_Count, Base, Input_Texts, Score, Counts); + + while Running loop + Optimization_Round + (Holder, + Score, + Counts, + Pending, + Input_Texts, + Job_Count, + Running); + end loop; + + return Holder.Element; + end Optimize_Dictionary; + procedure Parallel_Evaluate_Dictionary (Job_Count : in Positive; Dict : in Natools.Smaz.Dictionary; Corpus : in Natools.Smaz.Tools.String_Lists.List; @@ -493,10 +573,16 @@ New_Line (Output); Put_Line (Output, Indent & Indent & "Compute dictionary from sample texts" & " in input S-expression"); + when Options.Fast_Text_Input => + New_Line (Output); + Put_Line (Output, Indent & Indent + & "Compute dictionary from sample texts" + & " in input S-expression, without optimization"); + when Options.Sx_Dict_Output => New_Line (Output); Put_Line (Output, Indent & Indent & "Output the dictionary as a S-expression"); @@ -532,23 +618,25 @@ Put_Line (Output, Indent & Indent & "substrings whose count is below the threshold."); end case; end loop; end Print_Help; + function To_Dictionary (Handler : in Callback'Class; Input : in Natools.Smaz.Tools.String_Lists.List) return Natools.Smaz.Dictionary is use type Natools.Smaz.Tools.String_Count; + use type Dict_Sources.Enum; begin case Handler.Dict_Source is when Dict_Sources.S_Expression => return 
Natools.Smaz.Tools.To_Dictionary (Input, True); - when Dict_Sources.Text_List => + when Dict_Sources.Text_List | Dict_Sources.Unoptimized_Text_List => declare Counter : Natools.Smaz.Tools.Word_Counter; begin for S of Input loop Natools.Smaz.Tools.Add_Substrings @@ -564,13 +652,28 @@ if Handler.Filter_Threshold > 0 then Natools.Smaz.Tools.Filter_By_Count (Counter, Handler.Filter_Threshold); end if; - return Natools.Smaz.Tools.To_Dictionary - (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254), - True); + if Handler.Dict_Source = Dict_Sources.Text_List then + declare + Selected, Pending : Natools.Smaz.Tools.String_Lists.List; + begin + Natools.Smaz.Tools.Simple_Dictionary_And_Pending + (Counter, 254, Selected, Pending); + + return Optimize_Dictionary + (Natools.Smaz.Tools.To_Dictionary (Selected, True), + Pending, + Input, + Handler.Job_Count); + end; + else + return Natools.Smaz.Tools.To_Dictionary + (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254), + True); + end if; end; end case; end To_Dictionary; Opt_Config : constant Getopt.Configuration := Getopt_Config;