16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
|
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
+
+
+
+
-
+
+
+
|
------------------------------------------------------------------------------
-- Command Line Interface for primitives in Natools.Smaz.Tools. --
------------------------------------------------------------------------------
with Ada.Characters.Latin_1;
with Ada.Command_Line;
with Ada.Containers.Indefinite_Holders;
with Ada.Streams;
with Ada.Strings.Fixed;
with Ada.Strings.Unbounded;
with Ada.Text_IO.Text_Streams;
with Natools.Getopt_Long;
with Natools.Parallelism;
with Natools.S_Expressions.Parsers;
with Natools.S_Expressions.Printers;
with Natools.Smaz.Tools;
with Natools.Smaz.Tools.GNAT;
with Natools.String_Escapes;
procedure Smaz is
--  Local shorthand for Natools.S_Expressions.To_Atom: turns a String into
--  an Ada.Streams.Stream_Element_Array.
function To_SEA (S : String) return Ada.Streams.Stream_Element_Array
renames Natools.S_Expressions.To_Atom;
--  Holder container for a single Natools.Smaz.Dictionary value, compared
--  with Natools.Smaz."=". Optimize_Dictionary and Optimization_Round use
--  it to carry the dictionary being worked on.
package Holders is new Ada.Containers.Indefinite_Holders
(Natools.Smaz.Dictionary, Natools.Smaz."=");
--  Namespace for the enumeration of top-level actions.
--  NOTE(review): presumably Decode, Encode and Evaluate are selected by
--  the command-line options of the same name in Options.Id, and Nothing
--  means that no action was requested -- confirm against the option
--  callback, which is not visible here.
package Actions is
type Enum is
(Nothing,
Decode,
Encode,
Evaluate);
end Actions;
package Dict_Sources is
   --  How To_Dictionary interprets its input list of strings.

   type Enum is
     (S_Expression,
      --  The input is converted directly into a dictionary.

      Text_List,
      --  The input is a set of sample texts: substrings are counted, an
      --  initial dictionary is selected from them, and the result is
      --  passed through Optimize_Dictionary.

      Unoptimized_Text_List);
      --  Same as Text_List, but the initially selected dictionary is
      --  returned as-is, without calling Optimize_Dictionary.
end Dict_Sources;
--  Identifiers of the command-line options, used to instantiate
--  Natools.Getopt_Long. The spellings noted below are the ones registered
--  by the Add_Option calls visible in Getopt_Config; identifiers without
--  a note are registered in a part of Getopt_Config not shown here.
package Options is
type Id is
(Output_Ada_Dict,
Dictionary_Input,
Decode,
Encode,
Evaluate,
Filter_Threshold,
Output_Hash,
Job_Count,         --  --jobs, -j (requires an argument)
Help,
Sx_Dict_Output,    --  --sx-dict, -L
Min_Sub_Size,      --  --min-substring, -m (requires an argument)
Max_Sub_Size,      --  --max-substring, -M (requires an argument)
Stat_Output,       --  --stats, -s
No_Stat_Output,    --  --no-stats, -S
Text_List_Input,   --  --text-list, -t
Fast_Text_Input,   --  --fast-text-list, -T
Max_Word_Size,     --  --max-word-len, -W (requires an argument)
Sx_Output,         --  --s-expr, -x
No_Sx_Output);     --  --no-s-expr, -X
end Options;
--  Command-line option parser specialized on Options.Id: every option
--  registered in Getopt_Config is tagged with one Options.Id value.
package Getopt is new Natools.Getopt_Long (Options.Id);
|
110
111
112
113
114
115
116
117
118
119
120
121
122
123
|
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
Counts : out Natools.Smaz.Tools.Dictionary_Counts);
-- Dispatch to parallel or non-parallel version of Evaluate_Dictionary
-- depending on Job_Count.
function Getopt_Config return Getopt.Configuration;
-- Build the configuration object
procedure Optimization_Round
(Dict : in out Holders.Holder;
Score : in out Ada.Streams.Stream_Element_Count;
Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
Job_Count : in Natural;
Updated : out Boolean);
-- Try to improve on Dict by replacing a single entry from it with
-- one of the substrings in Pending_Words.
function Optimize_Dictionary
(Base : in Natools.Smaz.Dictionary;
Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
Job_Count : in Natural)
return Natools.Smaz.Dictionary;
-- Optimize the dictionary on Input_Texts, starting with Base and
-- adding substrings from Pending_Words.
procedure Parallel_Evaluate_Dictionary
(Job_Count : in Positive;
Dict : in Natools.Smaz.Dictionary;
Corpus : in Natools.Smaz.Tools.String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Natools.Smaz.Tools.Dictionary_Counts);
-- Return the same results as Natools.Smaz.Tools.Evaluate_Dictionary,
|
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
|
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
|
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
R.Add_Option ("jobs", 'j', Required_Argument, Job_Count);
R.Add_Option ("sx-dict", 'L', No_Argument, Sx_Dict_Output);
R.Add_Option ("min-substring", 'm', Required_Argument, Min_Sub_Size);
R.Add_Option ("max-substring", 'M', Required_Argument, Max_Sub_Size);
R.Add_Option ("stats", 's', No_Argument, Stat_Output);
R.Add_Option ("no-stats", 'S', No_Argument, No_Stat_Output);
R.Add_Option ("text-list", 't', No_Argument, Text_List_Input);
R.Add_Option ("fast-text-list", 'T', No_Argument, Fast_Text_Input);
R.Add_Option ("max-word-len", 'W', Required_Argument, Max_Word_Size);
R.Add_Option ("s-expr", 'x', No_Argument, Sx_Output);
R.Add_Option ("no-s-expr", 'X', No_Argument, No_Sx_Output);
return R;
end Getopt_Config;
procedure Optimization_Round
  (Dict : in out Holders.Holder;
   Score : in out Ada.Streams.Stream_Element_Count;
   Counts : in out Natools.Smaz.Tools.Dictionary_Counts;
   Pending_Words : in out Natools.Smaz.Tools.String_Lists.List;
   Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
   Job_Count : in Natural;
   Updated : out Boolean)
is
   --  Placeholder implementation: no replacement is attempted, every
   --  input is ignored and the dictionary is always reported as
   --  unchanged, so a caller looping on Updated stops after one round.

   pragma Unreferenced
     (Dict, Score, Counts, Pending_Words, Input_Texts, Job_Count);
begin
   Updated := False;
end Optimization_Round;
function Optimize_Dictionary
  (Base : in Natools.Smaz.Dictionary;
   Pending_Words : in Natools.Smaz.Tools.String_Lists.List;
   Input_Texts : in Natools.Smaz.Tools.String_Lists.List;
   Job_Count : in Natural)
  return Natools.Smaz.Dictionary
is
   --  Score Base against Input_Texts, then keep running optimization
   --  rounds until one of them reports that it changed nothing, and
   --  return the dictionary held at that point.

   Current : Holders.Holder := Holders.To_Holder (Base);
   --  Dictionary being refined, starting from Base

   Remaining : Natools.Smaz.Tools.String_Lists.List := Pending_Words;
   --  Working copy of the candidates, since rounds may update the list

   Best_Score : Ada.Streams.Stream_Element_Count;
   Usage : Natools.Smaz.Tools.Dictionary_Counts;
   Improved : Boolean;
begin
   Evaluate_Dictionary (Job_Count, Base, Input_Texts, Best_Score, Usage);

   loop
      Optimization_Round
        (Dict => Current,
         Score => Best_Score,
         Counts => Usage,
         Pending_Words => Remaining,
         Input_Texts => Input_Texts,
         Job_Count => Job_Count,
         Updated => Improved);

      --  At least one round always runs; stop at the first one that
      --  leaves the dictionary untouched.
      exit when not Improved;
   end loop;

   return Current.Element;
end Optimize_Dictionary;
procedure Parallel_Evaluate_Dictionary
(Job_Count : in Positive;
Dict : in Natools.Smaz.Dictionary;
Corpus : in Natools.Smaz.Tools.String_Lists.List;
Compressed_Size : out Ada.Streams.Stream_Element_Count;
Counts : out Natools.Smaz.Tools.Dictionary_Counts)
|
491
492
493
494
495
496
497
498
499
500
501
502
503
504
|
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
|
+
+
+
+
+
+
|
when Options.Text_List_Input =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Compute dictionary from sample texts"
& " in input S-expression");
when Options.Fast_Text_Input =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Compute dictionary from sample texts"
& " in input S-expression, without optimization");
when Options.Sx_Dict_Output =>
New_Line (Output);
Put_Line (Output, Indent & Indent
& "Output the dictionary as a S-expression");
when Options.Min_Sub_Size =>
New_Line (Output);
|
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
|
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
|
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
+
+
+
+
|
Put_Line (Output, Indent & Indent
& "Before building a dictionary from substrings, remove");
Put_Line (Output, Indent & Indent
& "substrings whose count is below the threshold.");
end case;
end loop;
end Print_Help;
function To_Dictionary
  (Handler : in Callback'Class;
   Input : in Natools.Smaz.Tools.String_Lists.List)
  return Natools.Smaz.Dictionary
is
   --  Build a dictionary from Input, according to Handler.Dict_Source:
   --   *  S_Expression: Input is converted directly into a dictionary;
   --   *  Text_List: Input is a list of sample texts whose substrings are
   --      counted, an initial dictionary is selected from the counts, and
   --      it is then refined by Optimize_Dictionary against the samples;
   --   *  Unoptimized_Text_List: same counting, but the initial
   --      selection is returned without the optimization pass.

   use type Natools.Smaz.Tools.String_Count;
   use type Dict_Sources.Enum;
begin
   case Handler.Dict_Source is
      when Dict_Sources.S_Expression =>
         return Natools.Smaz.Tools.To_Dictionary (Input, True);

      when Dict_Sources.Text_List | Dict_Sources.Unoptimized_Text_List =>
         declare
            Counter : Natools.Smaz.Tools.Word_Counter;
         begin
            for S of Input loop
               --  Count substrings in the configured length range
               Natools.Smaz.Tools.Add_Substrings
                 (Counter, S, Handler.Min_Sub_Size, Handler.Max_Sub_Size);

               --  Lengths above Max_Sub_Size are only considered through
               --  Add_Words, up to Max_Word_Size
               if Handler.Max_Word_Size > Handler.Max_Sub_Size then
                  Natools.Smaz.Tools.Add_Words
                    (Counter, S,
                     Handler.Max_Sub_Size + 1, Handler.Max_Word_Size);
               end if;
            end loop;

            --  A zero threshold disables filtering
            if Handler.Filter_Threshold > 0 then
               Natools.Smaz.Tools.Filter_By_Count
                 (Counter, Handler.Filter_Threshold);
            end if;

            if Handler.Dict_Source = Dict_Sources.Text_List then
               declare
                  Selected, Pending : Natools.Smaz.Tools.String_Lists.List;
               begin
                  --  Entries that are not selected are kept in Pending as
                  --  replacement candidates for the optimization pass
                  Natools.Smaz.Tools.Simple_Dictionary_And_Pending
                    (Counter, 254, Selected, Pending);

                  return Optimize_Dictionary
                    (Natools.Smaz.Tools.To_Dictionary (Selected, True),
                     Pending,
                     Input,
                     Handler.Job_Count);
               end;
            else
               return Natools.Smaz.Tools.To_Dictionary
                 (Natools.Smaz.Tools.Simple_Dictionary (Counter, 254),
                  True);
            end if;
         end;
   end case;
end To_Dictionary;
--  Option table, computed once from Getopt_Config.
Opt_Config : constant Getopt.Configuration := Getopt_Config;
--  NOTE(review): presumably the object that receives parsed options and
--  is later handed to To_Dictionary -- confirm in the statements below.
Handler : Callback;
--  NOTE(review): string lists whose use is outside the visible part of
--  the file -- confirm their roles before relying on the names.
Input_List, Input_Data : Natools.Smaz.Tools.String_Lists.List;
|