/* -*-mode:java; c-basic-offset:2; -*- */
/*
Copyright (c) 2000,2001,2002,2003 ymnk, JCraft,Inc. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

  1. Redistributions of source code must retain the above copyright notice,
     this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright 
     notice, this list of conditions and the following disclaimer in 
     the documentation and/or other materials provided with the distribution.

  3. The names of the authors may not be used to endorse or promote products
     derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED WARRANTIES,
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JCRAFT,
INC. OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * This program is based on zlib-1.1.3, so all credit should go to the authors
 * Jean-loup Gailly(jloup@gzip.org) and Mark Adler(madler@alumni.caltech.edu)
 * and contributors of zlib.
 */

package com.jcraft.jzlib;

public final class Deflate{

  static final private int MAX_MEM_LEVEL=9;

  static final private int Z_DEFAULT_COMPRESSION=-1;

  static final private int MAX_WBITS=15;            // 32K LZ77 window
  static final private int DEF_MEM_LEVEL=8;

  static class Config{
    int good_length; // reduce lazy search above this match length
    int max_lazy;    // do not perform lazy search above this match length
    int nice_length; // quit search above this match length
    int max_chain;
    int func;
    Config(int good_length, int max_lazy, 
	   int nice_length, int max_chain, int func){
      this.good_length=good_length;
      this.max_lazy=max_lazy;
      this.nice_length=nice_length;
      this.max_chain=max_chain;
      this.func=func;
    }
  }
  
  static final private int STORED=0;
  static final private int FAST=1;
  static final private int SLOW=2;
  static final private Config[] config_table;    
  static{
    config_table=new Config[10];
    //                         good  lazy  nice  chain
    config_table[0]=new Config(0,    0,    0,    0, STORED);
    config_table[1]=new Config(4,    4,    8,    4, FAST);
    config_table[2]=new Config(4,    5,   16,    8, FAST);
    config_table[3]=new Config(4,    6,   32,   32, FAST);

    config_table[4]=new Config(4,    4,   16,   16, SLOW);
    config_table[5]=new Config(8,   16,   32,   32, SLOW);
    config_table[6]=new Config(8,   16,  128,  128, SLOW);
    config_table[7]=new Config(8,   32,  128,  256, SLOW);
    config_table[8]=new Config(32, 128,  258, 1024, SLOW);
    config_table[9]=new Config(32, 258,  258, 4096, SLOW);
  }
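  // For example, the default compression level (6; see deflateInit2) uses
  // config_table[6] above: good_length=8, max_lazy=16, nice_length=128,
  // max_chain=128, handled by the SLOW (lazy-match) deflate_slow function.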

  static final private String[] z_errmsg = {
    "need dictionary",     // Z_NEED_DICT       2
    "stream end",          // Z_STREAM_END      1
    "",                    // Z_OK              0
    "file error",          // Z_ERRNO         (-1)
    "stream error",        // Z_STREAM_ERROR  (-2)
    "data error",          // Z_DATA_ERROR    (-3)
    "insufficient memory", // Z_MEM_ERROR     (-4)
    "buffer error",        // Z_BUF_ERROR     (-5)
    "incompatible version",// Z_VERSION_ERROR (-6)
    ""
  };

  // block not completed, need more input or more output
  static final private int NeedMore=0; 

  // block flush performed
  static final private int BlockDone=1; 

  // finish started, need only more output at next deflate
  static final private int FinishStarted=2;

  // finish done, accept no more input or output
  static final private int FinishDone=3;

  // preset dictionary flag in zlib header
  static final private int PRESET_DICT=0x20;

  static final private int Z_FILTERED=1;
  static final private int Z_HUFFMAN_ONLY=2;
  static final private int Z_DEFAULT_STRATEGY=0;

  static final private int Z_NO_FLUSH=0;
  static final private int Z_PARTIAL_FLUSH=1;
  static final private int Z_SYNC_FLUSH=2;
  static final private int Z_FULL_FLUSH=3;
  static final private int Z_FINISH=4;

  static final private int Z_OK=0;
  static final private int Z_STREAM_END=1;
  static final private int Z_NEED_DICT=2;
  static final private int Z_ERRNO=-1;
  static final private int Z_STREAM_ERROR=-2;
  static final private int Z_DATA_ERROR=-3;
  static final private int Z_MEM_ERROR=-4;
  static final private int Z_BUF_ERROR=-5;
  static final private int Z_VERSION_ERROR=-6;

  static final private int INIT_STATE=42;
  static final private int BUSY_STATE=113;
  static final private int FINISH_STATE=666;

  // The deflate compression method
  static final private int Z_DEFLATED=8;

  static final private int STORED_BLOCK=0;
  static final private int STATIC_TREES=1;
  static final private int DYN_TREES=2;

  // The three kinds of block type
  static final private int Z_BINARY=0;
  static final private int Z_ASCII=1;
  static final private int Z_UNKNOWN=2;

  static final private int Buf_size=8*2;

  // repeat previous bit length 3-6 times (2 bits of repeat count)
  static final private int REP_3_6=16; 

  // repeat a zero length 3-10 times  (3 bits of repeat count)
  static final private int REPZ_3_10=17; 

  // repeat a zero length 11-138 times  (7 bits of repeat count)
  static final private int REPZ_11_138=18; 

  static final private int MIN_MATCH=3;
  static final private int MAX_MATCH=258;
  static final private int MIN_LOOKAHEAD=(MAX_MATCH+MIN_MATCH+1);

  static final private int MAX_BITS=15;
  static final private int D_CODES=30;
  static final private int BL_CODES=19;
  static final private int LENGTH_CODES=29;
  static final private int LITERALS=256;
  static final private int L_CODES=(LITERALS+1+LENGTH_CODES);
  static final private int HEAP_SIZE=(2*L_CODES+1);

  static final private int END_BLOCK=256;

  ZStream strm;         // pointer back to this zlib stream
  int status;           // as the name implies
  byte[] pending_buf;   // output still pending
  int pending_buf_size; // size of pending_buf
  int pending_out;      // next pending byte to output to the stream
  int pending;          // nb of bytes in the pending buffer
  int noheader;         // suppress zlib header and adler32
  byte data_type;       // UNKNOWN, BINARY or ASCII
  byte method;          // STORED (for zip only) or DEFLATED
  int last_flush;       // value of flush param for previous deflate call

  int w_size;           // LZ77 window size (32K by default)
  int w_bits;           // log2(w_size)  (8..16)
  int w_mask;           // w_size - 1

  byte[] window;
  // Sliding window. Input bytes are read into the second half of the window,
  // and move to the first half later to keep a dictionary of at least wSize
  // bytes. With this organization, matches are limited to a distance of
  // wSize-MAX_MATCH bytes, but this ensures that IO is always
  // performed with a length multiple of the block size. Also, it limits
  // the window size to 64K, which is quite useful on MSDOS.
  // To do: use the user input buffer as sliding window.

  int window_size;
  // Actual size of window: 2*wSize, except when the user input buffer
  // is directly used as sliding window.

  short[] prev;
  // Link to older string with same hash index. To limit the size of this
  // array to 64K, this link is maintained only for the last 32K strings.
  // An index in this array is thus a window index modulo 32K.

  short[] head; // Heads of the hash chains or NIL.

  int ins_h;          // hash index of string to be inserted
  int hash_size;      // number of elements in hash table
  int hash_bits;      // log2(hash_size)
  int hash_mask;      // hash_size-1

  // Number of bits by which ins_h must be shifted at each input
  // step. It must be such that after MIN_MATCH steps, the oldest
  // byte no longer takes part in the hash key, that is:
  // hash_shift * MIN_MATCH >= hash_bits
  int hash_shift;
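  // For example, with the default DEF_MEM_LEVEL=8, deflateInit2 computes
  // hash_bits=15 and hash_shift=(15+MIN_MATCH-1)/MIN_MATCH=5, and indeed
  // 5*MIN_MATCH=15 >= hash_bits.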

  // Window position at the beginning of the current output block. Gets
  // negative when the window is moved backwards.

  int block_start;

  int match_length;           // length of best match
  int prev_match;             // previous match
  int match_available;        // set if previous match exists
  int strstart;               // start of string to insert
  int match_start;            // start of matching string
  int lookahead;              // number of valid bytes ahead in window

  // Length of the best match at previous step. Matches not greater than this
  // are discarded. This is used in the lazy match evaluation.
  int prev_length;

  // To speed up deflation, hash chains are never searched beyond this
  // length.  A higher limit improves compression ratio but degrades the speed.
  int max_chain_length;

  // Attempt to find a better match only when the current match is strictly
  // smaller than this value. This mechanism is used only for compression
  // levels >= 4.
  int max_lazy_match;

  // Insert new strings in the hash table only if the match length is not
  // greater than this length. This saves time but degrades compression.
  // (zlib calls this max_insert_length; in this port max_lazy_match serves
  // that purpose, and it matters only for compression levels <= 3.)

  int level;    // compression level (1..9)
  int strategy; // favor or force Huffman coding

  // Use a faster search when the previous match is longer than this
  int good_match;

  // Stop searching when current match exceeds this
  int nice_match;

  short[] dyn_ltree;       // literal and length tree
  short[] dyn_dtree;       // distance tree
  short[] bl_tree;         // Huffman tree for bit lengths

  Tree l_desc=new Tree();  // desc for literal tree
  Tree d_desc=new Tree();  // desc for distance tree
  Tree bl_desc=new Tree(); // desc for bit length tree

  // number of codes at each bit length for an optimal tree
  short[] bl_count=new short[MAX_BITS+1];

  // heap used to build the Huffman trees
  int[] heap=new int[2*L_CODES+1];

  int heap_len;               // number of elements in the heap
  int heap_max;               // element of largest frequency
  // The sons of heap[n] are heap[2*n] and heap[2*n+1]. heap[0] is not used.
  // The same heap array is used to build all trees.

  // Depth of each subtree used as tie breaker for trees of equal frequency
  byte[] depth=new byte[2*L_CODES+1];

  int l_buf;               // index for literals or lengths

  // Size of match buffer for literals/lengths.  There are 4 reasons for
  // limiting lit_bufsize to 64K:
  //   - frequencies can be kept in 16 bit counters
  //   - if compression is not successful for the first block, all input
  //     data is still in the window so we can still emit a stored block even
  //     when input comes from standard input.  (This can also be done for
  //     all blocks if lit_bufsize is not greater than 32K.)
  //   - if compression is not successful for a file smaller than 64K, we can
  //     even emit a stored file instead of a stored block (saving 5 bytes).
  //     This is applicable only for zip (not gzip or zlib).
  //   - creating new Huffman trees less frequently may not provide fast
  //     adaptation to changes in the input data statistics. (Take for
  //     example a binary file with poorly compressible code followed by
  //     a highly compressible string table.) Smaller buffer sizes give
  //     fast adaptation but have of course the overhead of transmitting
  //     trees more frequently.
  //   - I can't count above 4
  int lit_bufsize;

  int last_lit;      // running index in l_buf

  // Buffer for distances. To simplify the code, d_buf and l_buf have
  // the same number of elements. To use different lengths, an extra flag
  // array would be necessary.

  int d_buf;         // index into pending_buf

  int opt_len;        // bit length of current block with optimal trees
  int static_len;     // bit length of current block with static trees
  int matches;        // number of string matches in current block
  int last_eob_len;   // bit length of EOB code for last block

  // Output buffer. bits are inserted starting at the bottom (least
  // significant bits).
  short bi_buf;

  // Number of valid bits in bi_buf.  All bits above the last valid bit
  // are always zero.
  int bi_valid;

  Deflate(){
    dyn_ltree=new short[HEAP_SIZE*2];
    dyn_dtree=new short[(2*D_CODES+1)*2]; // distance tree
    bl_tree=new short[(2*BL_CODES+1)*2];  // Huffman tree for bit lengths
  }

  void lm_init() {
    window_size=2*w_size;

    head[hash_size-1]=0;
    for(int i=0; i<hash_size-1; i++){
      head[i]=0;
    }

    // Set the default configuration parameters:
    max_lazy_match   = Deflate.config_table[level].max_lazy;
    good_match       = Deflate.config_table[level].good_length;
    nice_match       = Deflate.config_table[level].nice_length;
    max_chain_length = Deflate.config_table[level].max_chain;

    strstart = 0;
    block_start = 0;
    lookahead = 0;
    match_length = prev_length = MIN_MATCH-1;
    match_available = 0;
    ins_h = 0;
  }

  // Initialize the tree data structures for a new zlib stream.
  void tr_init(){

    l_desc.dyn_tree = dyn_ltree;
    l_desc.stat_desc = StaticTree.static_l_desc;

    d_desc.dyn_tree = dyn_dtree;
    d_desc.stat_desc = StaticTree.static_d_desc;

    bl_desc.dyn_tree = bl_tree;
    bl_desc.stat_desc = StaticTree.static_bl_desc;

    bi_buf = 0;
    bi_valid = 0;
    last_eob_len = 8; // enough lookahead for inflate

    // Initialize the first block of the first file:
    init_block();
  }

  void init_block(){
    // Initialize the trees.
    for(int i = 0; i < L_CODES; i++) dyn_ltree[i*2] = 0;
    for(int i= 0; i < D_CODES; i++) dyn_dtree[i*2] = 0;
    for(int i= 0; i < BL_CODES; i++) bl_tree[i*2] = 0;

    dyn_ltree[END_BLOCK*2] = 1;
    opt_len = static_len = 0;
    last_lit = matches = 0;
  }

  // Restore the heap property by moving down the tree starting at node k,
  // exchanging a node with the smallest of its two sons if necessary, stopping
  // when the heap property is re-established (each father smaller than its
  // two sons).
  void pqdownheap(short[] tree,  // the tree to restore
		  int k          // node to move down
		  ){
    int v = heap[k];
    int j = k << 1;  // left son of k
    while (j <= heap_len) {
      // Set j to the smallest of the two sons:
      if (j < heap_len &&
	  smaller(tree, heap[j+1], heap[j], depth)){
	j++;
      }
      // Exit if v is smaller than both sons
      if(smaller(tree, v, heap[j], depth)) break;

      // Exchange v with the smallest son
      heap[k]=heap[j];  k = j;
      // And continue down the tree, setting j to the left son of k
      j <<= 1;
    }
    heap[k] = v;
  }

  static boolean smaller(short[] tree, int n, int m, byte[] depth){
    short tn2=tree[n*2];
    short tm2=tree[m*2];
    return (tn2<tm2 ||
	    (tn2==tm2 && depth[n] <= depth[m]));
  }

  // Scan a literal or distance tree to determine the frequencies of the codes
  // in the bit length tree.
  void scan_tree (short[] tree,// the tree to be scanned
		  int max_code // and its largest code of non zero frequency
		  ){
    int n;                     // iterates over all tree elements
    int prevlen = -1;          // last emitted length
    int curlen;                // length of current code
    int nextlen = tree[0*2+1]; // length of next code
    int count = 0;             // repeat count of the current code
    int max_count = 7;         // max repeat count
    int min_count = 4;         // min repeat count

    if (nextlen == 0){ max_count = 138; min_count = 3; }
    tree[(max_code+1)*2+1] = (short)0xffff; // guard

    for(n = 0; n <= max_code; n++) {
      curlen = nextlen; nextlen = tree[(n+1)*2+1];
      if(++count < max_count && curlen == nextlen) {
	continue;
      }
      else if(count < min_count) {
	bl_tree[curlen*2] += count;
      }
      else if(curlen != 0) {
	if(curlen != prevlen) bl_tree[curlen*2]++;
	bl_tree[REP_3_6*2]++;
      }
      else if(count <= 10) {
	bl_tree[REPZ_3_10*2]++;
      }
      else{
	bl_tree[REPZ_11_138*2]++;
      }
      count = 0; prevlen = curlen;
      if(nextlen == 0) {
	max_count = 138; min_count = 3;
      }
      else if(curlen == nextlen) {
	max_count = 6; min_count = 3;
      }
      else{
	max_count = 7; min_count = 4;
      }
    }
  }

  // Construct the Huffman tree for the bit lengths and return the index in
  // bl_order of the last bit length code to send.
  int build_bl_tree(){
    int max_blindex;  // index of last bit length code of non zero freq

    // Determine the bit length frequencies for literal and distance trees
    scan_tree(dyn_ltree, l_desc.max_code);
    scan_tree(dyn_dtree, d_desc.max_code);

    // Build the bit length tree:
    bl_desc.build_tree(this);
    // opt_len now includes the length of the tree representations, except
    // the lengths of the bit lengths codes and the 5+5+4 bits for the counts.

    // Determine the number of bit length codes to send. The pkzip format
    // requires that at least 4 bit length codes be sent. (appnote.txt says
    // 3 but the actual value used is 4.)
    for (max_blindex = BL_CODES-1; max_blindex >= 3; max_blindex--) {
      if (bl_tree[Tree.bl_order[max_blindex]*2+1] != 0) break;
    }
    // Update opt_len to include the bit length tree and counts
    opt_len += 3*(max_blindex+1) + 5+5+4;

    return max_blindex;
  }


  // Send the header for a block using dynamic Huffman trees: the counts, the
  // lengths of the bit length codes, the literal tree and the distance tree.
  // IN assertion: lcodes >= 257, dcodes >= 1, blcodes >= 4.
  void send_all_trees(int lcodes, int dcodes, int blcodes){
    int rank;                    // index in bl_order

    send_bits(lcodes-257, 5); // not +255 as stated in appnote.txt
    send_bits(dcodes-1,   5);
    send_bits(blcodes-4,  4); // not -3 as stated in appnote.txt
    for (rank = 0; rank < blcodes; rank++) {
      send_bits(bl_tree[Tree.bl_order[rank]*2+1], 3);
    }
    send_tree(dyn_ltree, lcodes-1); // literal tree
    send_tree(dyn_dtree, dcodes-1); // distance tree
  }

  // Send a literal or distance tree in compressed form, using the codes in
  // bl_tree.
  void send_tree (short[] tree,// the tree to be sent
		  int max_code // and its largest code of non zero frequency
		  ){
    int n;                     // iterates over all tree elements
    int prevlen = -1;          // last emitted length
    int curlen;                // length of current code
    int nextlen = tree[0*2+1]; // length of next code
    int count = 0;             // repeat count of the current code
    int max_count = 7;         // max repeat count
    int min_count = 4;         // min repeat count

    if (nextlen == 0){ max_count = 138; min_count = 3; }

    for (n = 0; n <= max_code; n++) {
      curlen = nextlen; nextlen = tree[(n+1)*2+1];
      if(++count < max_count && curlen == nextlen) {
	continue;
      }
      else if(count < min_count) {
	do { send_code(curlen, bl_tree); } while (--count != 0);
      }
      else if(curlen != 0){
	if(curlen != prevlen){
	  send_code(curlen, bl_tree); count--;
	}
	send_code(REP_3_6, bl_tree); 
	send_bits(count-3, 2);
      }
      else if(count <= 10){
	send_code(REPZ_3_10, bl_tree); 
	send_bits(count-3, 3);
      }
      else{
	send_code(REPZ_11_138, bl_tree);
	send_bits(count-11, 7);
      }
      count = 0; prevlen = curlen;
      if(nextlen == 0){
	max_count = 138; min_count = 3;
      }
      else if(curlen == nextlen){
	max_count = 6; min_count = 3;
      }
      else{
	max_count = 7; min_count = 4;
      }
    }
  }

  // Output a byte on the stream.
  // IN assertion: there is enough room in pending_buf.
  final void put_byte(byte[] p, int start, int len){
    System.arraycopy(p, start, pending_buf, pending, len);
    pending+=len;
  }

  final void put_byte(byte c){
    pending_buf[pending++]=c;
  }
  final void put_short(int w) {
    put_byte((byte)(w/*&0xff*/));
    put_byte((byte)(w>>>8));
  }
  final void putShortMSB(int b){
    put_byte((byte)(b>>8));
    put_byte((byte)(b/*&0xff*/));
  }   

  final void send_code(int c, short[] tree){
    int c2=c*2;
    send_bits((tree[c2]&0xffff), (tree[c2+1]&0xffff));
  }

  void send_bits(int value, int length){
    int len = length;
    if (bi_valid > (int)Buf_size - len) {
      int val = value;
//      bi_buf |= (val << bi_valid);
      bi_buf |= ((val << bi_valid)&0xffff);
      put_short(bi_buf);
      bi_buf = (short)(val >>> (Buf_size - bi_valid));
      bi_valid += len - Buf_size;
    } else {
//      bi_buf |= (value) << bi_valid;
      bi_buf |= (((value) << bi_valid)&0xffff);
      bi_valid += len;
    }
  }
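  // A small worked example of the accumulation above: starting from
  // bi_valid=0, send_bits(0x5 /*101b*/, 3) leaves bi_buf=101b and bi_valid=3;
  // a following send_bits(0x3 /*11b*/, 2) ORs 11b shifted left by 3 bits,
  // giving bi_buf=11101b and bi_valid=5. Once bi_valid plus the new length
  // would exceed Buf_size (16), the low 16 bits are emitted with put_short
  // and the remaining high bits of the value stay in bi_buf.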

  // Send one empty static block to give enough lookahead for inflate.
  // This takes 10 bits, of which 7 may remain in the bit buffer.
  // The current inflate code requires 9 bits of lookahead. If the
  // last two codes for the previous block (real code plus EOB) were coded
  // on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode
  // the last real code. In this case we send two empty static blocks instead
  // of one. (There are no problems if the previous block is stored or fixed.)
  // To simplify the code, we assume the worst case of last real code encoded
  // on one bit only.
  void _tr_align(){
    send_bits(STATIC_TREES<<1, 3);
    send_code(END_BLOCK, StaticTree.static_ltree);

    bi_flush();

    // Of the 10 bits for the empty block, we have already sent
    // (10 - bi_valid) bits. The lookahead for the last real code (before
    // the EOB of the previous block) was thus at least one plus the length
    // of the EOB plus what we have just sent of the empty static block.
    if (1 + last_eob_len + 10 - bi_valid < 9) {
      send_bits(STATIC_TREES<<1, 3);
      send_code(END_BLOCK, StaticTree.static_ltree);
      bi_flush();
    }
    last_eob_len = 7;
  }


  // Save the match info and tally the frequency counts. Return true if
  // the current block must be flushed.
  boolean _tr_tally (int dist, // distance of matched string
		     int lc // match length-MIN_MATCH or unmatched char (if dist==0)
		     ){

    pending_buf[d_buf+last_lit*2] = (byte)(dist>>>8);
    pending_buf[d_buf+last_lit*2+1] = (byte)dist;

    pending_buf[l_buf+last_lit] = (byte)lc; last_lit++;

    if (dist == 0) {
      // lc is the unmatched char
      dyn_ltree[lc*2]++;
    } 
    else {
      matches++;
      // Here, lc is the match length - MIN_MATCH
      dist--;             // dist = match distance - 1
      dyn_ltree[(Tree._length_code[lc]+LITERALS+1)*2]++;
      dyn_dtree[Tree.d_code(dist)*2]++;
    }

    if ((last_lit & 0x1fff) == 0 && level > 2) {
      // Compute an upper bound for the compressed length
      int out_length = last_lit*8;
      int in_length = strstart - block_start;
      int dcode;
      for (dcode = 0; dcode < D_CODES; dcode++) {
	out_length += (int)dyn_dtree[dcode*2] *
	  (5L+Tree.extra_dbits[dcode]);
      }
      out_length >>>= 3;
      if ((matches < (last_lit/2)) && out_length < in_length/2) return true;
    }

    return (last_lit == lit_bufsize-1);
    // We avoid equality with lit_bufsize because of wraparound at 64K
    // on 16 bit machines and because stored blocks are restricted to
    // 64K-1 bytes.
  }

  // Send the block data compressed using the given Huffman trees
  void compress_block(short[] ltree, short[] dtree){
    int  dist;      // distance of matched string
    int lc;         // match length or unmatched char (if dist == 0)
    int lx = 0;     // running index in l_buf
    int code;       // the code to send
    int extra;      // number of extra bits to send

    if (last_lit != 0){
      do{
	dist=((pending_buf[d_buf+lx*2]<<8)&0xff00)|
	  (pending_buf[d_buf+lx*2+1]&0xff);
	lc=(pending_buf[l_buf+lx])&0xff; lx++;

	if(dist == 0){
	  send_code(lc, ltree); // send a literal byte
	} 
	else{
	  // Here, lc is the match length - MIN_MATCH
	  code = Tree._length_code[lc];

	  send_code(code+LITERALS+1, ltree); // send the length code
	  extra = Tree.extra_lbits[code];
	  if(extra != 0){
	    lc -= Tree.base_length[code];
	    send_bits(lc, extra);       // send the extra length bits
	  }
	  dist--; // dist is now the match distance - 1
	  code = Tree.d_code(dist);

	  send_code(code, dtree);       // send the distance code
	  extra = Tree.extra_dbits[code];
	  if (extra != 0) {
	    dist -= Tree.base_dist[code];
	    send_bits(dist, extra);   // send the extra distance bits
	  }
	} // literal or match pair ?

	// Check that the overlay between pending_buf and d_buf+l_buf is ok:
      }
      while (lx < last_lit);
    }

    send_code(END_BLOCK, ltree);
    last_eob_len = ltree[END_BLOCK*2+1];
  }

  // Set the data type to ASCII or BINARY, using a crude approximation:
  // binary if more than 20% of the bytes are <= 6 or >= 128, ascii otherwise.
  // IN assertion: the fields freq of dyn_ltree are set and the total of all
  // frequencies does not exceed 64K (to fit in an int on 16 bit machines).
  void set_data_type(){
    int n = 0;
    int  ascii_freq = 0;
    int  bin_freq = 0;
    while(n<7){ bin_freq += dyn_ltree[n*2]; n++;}
    while(n<128){ ascii_freq += dyn_ltree[n*2]; n++;}
    while(n<LITERALS){ bin_freq += dyn_ltree[n*2]; n++;}
    data_type=(byte)(bin_freq > (ascii_freq >>> 2) ? Z_BINARY : Z_ASCII);
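    // The shift implements the ">20%" rule above: bin_freq > ascii_freq/4 is
    // equivalent to bin_freq/(bin_freq+ascii_freq) > 1/5.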
  }

  // Flush the bit buffer, keeping at most 7 bits in it.
  void bi_flush(){
    if (bi_valid == 16) {
      put_short(bi_buf);
      bi_buf=0;
      bi_valid=0;
    }
    else if (bi_valid >= 8) {
      put_byte((byte)bi_buf);
      bi_buf>>>=8;
      bi_valid-=8;
    }
  }

  // Flush the bit buffer and align the output on a byte boundary
  void bi_windup(){
    if (bi_valid > 8) {
      put_short(bi_buf);
    } else if (bi_valid > 0) {
      put_byte((byte)bi_buf);
    }
    bi_buf = 0;
    bi_valid = 0;
  }

  // Copy a stored block, storing first the length and its
  // one's complement if requested.
  void copy_block(int buf,         // the input data
		  int len,         // its length
		  boolean header   // true if block header must be written
		  ){
    int index=0;
    bi_windup();      // align on byte boundary
    last_eob_len = 8; // enough lookahead for inflate

    if (header) {
      put_short((short)len);   
      put_short((short)~len);
    }

    //  while(len--!=0) {
    //    put_byte(window[buf+index]);
    //    index++;
    //  }
    put_byte(window, buf, len);
  }

  void flush_block_only(boolean eof){
    _tr_flush_block(block_start>=0 ? block_start : -1,
		    strstart-block_start,
		    eof);
    block_start=strstart;
    strm.flush_pending();
  }

  // Copy without compression as much as possible from the input stream, return
  // the current block state.
  // This function does not insert new strings in the dictionary since
  // uncompressible data is probably not useful. This function is used
  // only for the level=0 compression option.
  // NOTE: this function should be optimized to avoid extra copying from
  // window to pending_buf.
  int deflate_stored(int flush){
    // Stored blocks are limited to 0xffff bytes, pending_buf is limited
    // to pending_buf_size, and each stored block has a 5 byte header:

    int max_block_size = 0xffff;
    int max_start;

    if(max_block_size > pending_buf_size - 5) {
      max_block_size = pending_buf_size - 5;
    }

    // Copy as much as possible from input to output:
    while(true){
      // Fill the window as much as possible:
      if(lookahead<=1){
	fill_window();
	if(lookahead==0 && flush==Z_NO_FLUSH) return NeedMore;
	if(lookahead==0) break; // flush the current block
      }

      strstart+=lookahead;
      lookahead=0;

      // Emit a stored block if pending_buf will be full:
      max_start=block_start+max_block_size;
      if(strstart==0|| strstart>=max_start) {
	// strstart == 0 is possible when wrapping around on a 16-bit machine
	lookahead = (int)(strstart-max_start);
	strstart = (int)max_start;
      
	flush_block_only(false);
	if(strm.avail_out==0) return NeedMore;

      }

      // Flush if we may have to slide, otherwise block_start may become
      // negative and the data will be gone:
      if(strstart-block_start >= w_size-MIN_LOOKAHEAD) {
	flush_block_only(false);
	if(strm.avail_out==0) return NeedMore;
      }
    }

    flush_block_only(flush == Z_FINISH);
    if(strm.avail_out==0)
      return (flush == Z_FINISH) ? FinishStarted : NeedMore;

    return flush == Z_FINISH ? FinishDone : BlockDone;
  }

  // Send a stored block
  void _tr_stored_block(int buf,        // input block
			int stored_len, // length of input block
			boolean eof     // true if this is the last block for a file
			){
    send_bits((STORED_BLOCK<<1)+(eof?1:0), 3);  // send block type
    copy_block(buf, stored_len, true);          // with header
  }

  // Determine the best encoding for the current block: dynamic trees, static
  // trees or store, and output the encoded block to the zip file.
  void _tr_flush_block(int buf,        // input block, or NULL if too old
		       int stored_len, // length of input block
		       boolean eof     // true if this is the last block for a file
		       ) {
    int opt_lenb, static_lenb;// opt_len and static_len in bytes
    int max_blindex = 0;      // index of last bit length code of non zero freq

    // Build the Huffman trees unless a stored block is forced
    if(level > 0) {
      // Check if the file is ascii or binary
      if(data_type == Z_UNKNOWN) set_data_type();

      // Construct the literal and distance trees
      l_desc.build_tree(this);

      d_desc.build_tree(this);

      // At this point, opt_len and static_len are the total bit lengths of
      // the compressed block data, excluding the tree representations.

      // Build the bit length tree for the above two trees, and get the index
      // in bl_order of the last bit length code to send.
      max_blindex=build_bl_tree();

      // Determine the best encoding. Compute first the block length in bytes
      opt_lenb=(opt_len+3+7)>>>3;
      static_lenb=(static_len+3+7)>>>3;

      if(static_lenb<=opt_lenb) opt_lenb=static_lenb;
    }
    else {
      opt_lenb=static_lenb=stored_len+5; // force a stored block
    }

    if(stored_len+4<=opt_lenb && buf != -1){
      // 4: two words for the lengths
      // The test buf != -1 (NULL in zlib) is only necessary if LIT_BUFSIZE > WSIZE.
      // Otherwise we can't have processed more than WSIZE input bytes since
      // the last block flush, because compression would have been
      // successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
      // transform a block into a stored block.
      _tr_stored_block(buf, stored_len, eof);
    }
    else if(static_lenb == opt_lenb){
      send_bits((STATIC_TREES<<1)+(eof?1:0), 3);
      compress_block(StaticTree.static_ltree, StaticTree.static_dtree);
    }
    else{
      send_bits((DYN_TREES<<1)+(eof?1:0), 3);
      send_all_trees(l_desc.max_code+1, d_desc.max_code+1, max_blindex+1);
      compress_block(dyn_ltree, dyn_dtree);
    }

    // The above check is made mod 2^32, for files larger than 512 MB
    // and uLong implemented on 32 bits.

    init_block();

    if(eof){
      bi_windup();
    }
  }

  // Fill the window when the lookahead becomes insufficient.
  // Updates strstart and lookahead.
  //
  // IN assertion: lookahead < MIN_LOOKAHEAD
  // OUT assertions: strstart <= window_size-MIN_LOOKAHEAD
  //    At least one byte has been read, or avail_in == 0; reads are
  //    performed for at least two bytes (required for the zip translate_eol
  //    option -- not supported here).
  void fill_window(){
    int n, m;
    int p;
    int more;    // Amount of free space at the end of the window.

    do{
      more = (window_size-lookahead-strstart);

      // Deal with !@#$% 64K limit:
      if(more==0 && strstart==0 && lookahead==0){
	more = w_size;
      } 
      else if(more==-1) {
	// Very unlikely, but possible on 16 bit machine if strstart == 0
	// and lookahead == 1 (input done one byte at time)
	more--;

	// If the window is almost full and there is insufficient lookahead,
	// move the upper half to the lower one to make room in the upper half.
      }
      else if(strstart >= w_size+ w_size-MIN_LOOKAHEAD) {
	System.arraycopy(window, w_size, window, 0, w_size);
	match_start-=w_size;
	strstart-=w_size; // we now have strstart >= MAX_DIST
	block_start-=w_size;

	// Slide the hash table (could be avoided with 32 bit values
	// at the expense of memory usage). We slide even when level == 0
	// to keep the hash table consistent if we switch back to level > 0
	// later. (Using level 0 permanently is not an optimal usage of
	// zlib, so we don't care about this pathological case.)

	n = hash_size;
	p=n;
	do {
	  m = (head[--p]&0xffff);
	  head[p]=(m>=w_size ? (short)(m-w_size) : 0);
	}
	while (--n != 0);

	n = w_size;
	p = n;
	do {
	  m = (prev[--p]&0xffff);
	  prev[p] = (m >= w_size ? (short)(m-w_size) : 0);
	  // If n is not on any hash chain, prev[n] is garbage but
	  // its value will never be used.
	}
	while (--n!=0);
	more += w_size;
      }

      if (strm.avail_in == 0) return;

      // If there was no sliding:
      //    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
      //    more == window_size - lookahead - strstart
      // => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
      // => more >= window_size - 2*WSIZE + 2
      // In the BIG_MEM or MMAP case (not yet supported),
      //   window_size == input_size + MIN_LOOKAHEAD  &&
      //   strstart + lookahead <= input_size => more >= MIN_LOOKAHEAD.
      // Otherwise, window_size == 2*WSIZE so more >= 2.
      // If there was sliding, more >= WSIZE. So in all cases, more >= 2.

      n = strm.read_buf(window, strstart + lookahead, more);
      lookahead += n;

      // Initialize the hash value now that we have some input:
      if(lookahead >= MIN_MATCH) {
	ins_h = window[strstart]&0xff;
	ins_h=(((ins_h)<<hash_shift)^(window[strstart+1]&0xff))&hash_mask;
      }
      // If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
      // but this is not important since only literal bytes will be emitted.
    }
    while (lookahead < MIN_LOOKAHEAD && strm.avail_in != 0);
  }

  // Compress as much as possible from the input stream, return the current
  // block state.
  // This function does not perform lazy evaluation of matches and inserts
  // new strings in the dictionary only for unmatched strings or for short
  // matches. It is used only for the fast compression options.
  int deflate_fast(int flush){
//    short hash_head = 0; // head of the hash chain
    int hash_head = 0; // head of the hash chain
    boolean bflush;      // set if current block must be flushed

    while(true){
      // Make sure that we always have enough lookahead, except
      // at the end of the input file. We need MAX_MATCH bytes
      // for the next match, plus MIN_MATCH bytes to insert the
      // string following the next match.
      if(lookahead < MIN_LOOKAHEAD){
	fill_window();
	if(lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH){
	  return NeedMore;
	}
	if(lookahead == 0) break; // flush the current block
      }

      // Insert the string window[strstart .. strstart+2] in the
      // dictionary, and set hash_head to the head of the hash chain:
      if(lookahead >= MIN_MATCH){
	ins_h=(((ins_h)<<hash_shift)^(window[(strstart)+(MIN_MATCH-1)]&0xff))&hash_mask;

//	prev[strstart&w_mask]=hash_head=head[ins_h];
        hash_head=(head[ins_h]&0xffff);
	prev[strstart&w_mask]=head[ins_h];
	head[ins_h]=(short)strstart;
      }

      // Find the longest match, discarding those <= prev_length.
      // At this point we have always match_length < MIN_MATCH

      if(hash_head!=0L && 
	 ((strstart-hash_head)&0xffff) <= w_size-MIN_LOOKAHEAD
	 ){
	// To simplify the code, we prevent matches with the string
	// of window index 0 (in particular we have to avoid a match
	// of the string with itself at the start of the input file).
	if(strategy != Z_HUFFMAN_ONLY){
	  match_length=longest_match (hash_head);
	}
	// longest_match() sets match_start
      }
      if(match_length>=MIN_MATCH){
	//        check_match(strstart, match_start, match_length);

	bflush=_tr_tally(strstart-match_start, match_length-MIN_MATCH);

	lookahead -= match_length;

	// Insert new strings in the hash table only if the match length
	// is not too large. This saves time but degrades compression.
	if(match_length <= max_lazy_match &&
	   lookahead >= MIN_MATCH) {
	  match_length--; // string at strstart already in hash table
	  do{
	    strstart++;

	    ins_h=((ins_h<<hash_shift)^(window[(strstart)+(MIN_MATCH-1)]&0xff))&hash_mask;
//	    prev[strstart&w_mask]=hash_head=head[ins_h];
	    hash_head=(head[ins_h]&0xffff);
	    prev[strstart&w_mask]=head[ins_h];
	    head[ins_h]=(short)strstart;

	    // strstart never exceeds WSIZE-MAX_MATCH, so there are
	    // always MIN_MATCH bytes ahead.
	  }
	  while (--match_length != 0);
	  strstart++; 
	}
	else{
	  strstart += match_length;
	  match_length = 0;
	  ins_h = window[strstart]&0xff;

	  ins_h=(((ins_h)<<hash_shift)^(window[strstart+1]&0xff))&hash_mask;
	  // If lookahead < MIN_MATCH, ins_h is garbage, but it does not
	  // matter since it will be recomputed at next deflate call.
	}
      }
      else {
	// No match, output a literal byte

	bflush=_tr_tally(0, window[strstart]&0xff);
	lookahead--;
	strstart++; 
      }
      if (bflush){

	flush_block_only(false);
	if(strm.avail_out==0) return NeedMore;
      }
    }

    flush_block_only(flush == Z_FINISH);
    if(strm.avail_out==0){
      if(flush == Z_FINISH) return FinishStarted;
      else return NeedMore;
    }
    return flush==Z_FINISH ? FinishDone : BlockDone;
  }

  // Same as above, but achieves better compression. We use a lazy
  // evaluation for matches: a match is finally adopted only if there is
  // no better match at the next window position.
  int deflate_slow(int flush){
//    short hash_head = 0;    // head of hash chain
    int hash_head = 0;    // head of hash chain
    boolean bflush;         // set if current block must be flushed

    // Process the input block.
    while(true){
      // Make sure that we always have enough lookahead, except
      // at the end of the input file. We need MAX_MATCH bytes
      // for the next match, plus MIN_MATCH bytes to insert the
      // string following the next match.

      if (lookahead < MIN_LOOKAHEAD) {
	fill_window();
	if(lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) {
	  return NeedMore;
	}
	if(lookahead == 0) break; // flush the current block
      }

      // Insert the string window[strstart .. strstart+2] in the
      // dictionary, and set hash_head to the head of the hash chain:

      if(lookahead >= MIN_MATCH) {
	ins_h=(((ins_h)<<hash_shift)^(window[(strstart)+(MIN_MATCH-1)]&0xff)) & hash_mask;
//	prev[strstart&w_mask]=hash_head=head[ins_h];
	hash_head=(head[ins_h]&0xffff);
	prev[strstart&w_mask]=head[ins_h];
	head[ins_h]=(short)strstart;
      }

      // Find the longest match, discarding those <= prev_length.
      prev_length = match_length; prev_match = match_start;
      match_length = MIN_MATCH-1;

      if (hash_head != 0 && prev_length < max_lazy_match &&
	  ((strstart-hash_head)&0xffff) <= w_size-MIN_LOOKAHEAD
	  ){
	// To simplify the code, we prevent matches with the string
	// of window index 0 (in particular we have to avoid a match
	// of the string with itself at the start of the input file).

	if(strategy != Z_HUFFMAN_ONLY) {
	  match_length = longest_match(hash_head);
	}
	// longest_match() sets match_start

	if (match_length <= 5 && (strategy == Z_FILTERED ||
				  (match_length == MIN_MATCH &&
				   strstart - match_start > 4096))) {

	  // If prev_match is also MIN_MATCH, match_start is garbage
	  // but we will ignore the current match anyway.
	  match_length = MIN_MATCH-1;
	}
      }

      // If there was a match at the previous step and the current
      // match is not better, output the previous match:
      if(prev_length >= MIN_MATCH && match_length <= prev_length) {
	int max_insert = strstart + lookahead - MIN_MATCH;
	// Do not insert strings in hash table beyond this.

	//          check_match(strstart-1, prev_match, prev_length);

	bflush=_tr_tally(strstart-1-prev_match, prev_length - MIN_MATCH);

	// Insert in hash table all strings up to the end of the match.
	// strstart-1 and strstart are already inserted. If there is not
	// enough lookahead, the last two strings are not inserted in
	// the hash table.
	lookahead -= prev_length-1;
	prev_length -= 2;
	do{
	  if(++strstart <= max_insert) {
	    ins_h=(((ins_h)<<hash_shift)^(window[(strstart)+(MIN_MATCH-1)]&0xff))&hash_mask;
	    //prev[strstart&w_mask]=hash_head=head[ins_h];
	    hash_head=(head[ins_h]&0xffff);
	    prev[strstart&w_mask]=head[ins_h];
	    head[ins_h]=(short)strstart;
	  }
	}
	while(--prev_length != 0);
	match_available = 0;
	match_length = MIN_MATCH-1;
	strstart++;

	if (bflush){
	  flush_block_only(false);
	  if(strm.avail_out==0) return NeedMore;
	}
      } else if (match_available!=0) {

	// If there was no match at the previous position, output a
	// single literal. If there was a match but the current match
	// is longer, truncate the previous match to a single literal.

	bflush=_tr_tally(0, window[strstart-1]&0xff);

	if (bflush) {
	  flush_block_only(false);
	}
	strstart++;
	lookahead--;
	if(strm.avail_out == 0) return NeedMore;
      } else {
	// There is no previous match to compare with, wait for
	// the next step to decide.

	match_available = 1;
	strstart++;
	lookahead--;
      }
    }

    if(match_available!=0) {
      bflush=_tr_tally(0, window[strstart-1]&0xff);
      match_available = 0;
    }
    flush_block_only(flush == Z_FINISH);

    if(strm.avail_out==0){
      if(flush == Z_FINISH) return FinishStarted;
      else return NeedMore;
    }

    return flush == Z_FINISH ? FinishDone : BlockDone;
  }

  int longest_match(int cur_match){
    int chain_length = max_chain_length; // max hash chain length
    int scan = strstart;                 // current string
    int match;                           // matched string
    int len;                             // length of current match
    int best_len = prev_length;          // best match length so far
    int limit = strstart>(w_size-MIN_LOOKAHEAD) ?
      strstart-(w_size-MIN_LOOKAHEAD) : 0;
    int nice_match=this.nice_match;

    // Stop when cur_match becomes <= limit. To simplify the code,
    // we prevent matches with the string of window index 0.

    int wmask = w_mask;

    int strend = strstart + MAX_MATCH;
    byte scan_end1 = window[scan+best_len-1];
    byte scan_end = window[scan+best_len];

    // The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
    // It is easy to get rid of this optimization if necessary.

    // Do not waste too much time if we already have a good match:
    if (prev_length >= good_match) {
      chain_length >>= 2;
    }

    // Do not look for matches beyond the end of the input. This is necessary
    // to make deflate deterministic.
    if (nice_match > lookahead) nice_match = lookahead;

    do {
      match = cur_match;

      // Skip to next match if the match length cannot increase
      // or if the match length is less than 2:
      if (window[match+best_len]   != scan_end  ||
	  window[match+best_len-1] != scan_end1 ||
	  window[match]       != window[scan]     ||
	  window[++match]     != window[scan+1])      continue;

      // The check at best_len-1 can be removed because it will be made
      // again later. (This heuristic is not always a win.)
      // It is not necessary to compare scan[2] and match[2] since they
      // are always equal when the other bytes match, given that
      // the hash keys are equal and that HASH_BITS >= 8.
      scan += 2; match++;

      // We check for insufficient lookahead only every 8th comparison;
      // the 256th check will be made at strstart+258.
      do {
      } while (window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       window[++scan] == window[++match] &&
	       scan < strend);

      len = MAX_MATCH - (int)(strend - scan);
      scan = strend - MAX_MATCH;

      if(len>best_len) {
	match_start = cur_match;
	best_len = len;
	if (len >= nice_match) break;
	scan_end1  = window[scan+best_len-1];
	scan_end   = window[scan+best_len];
      }

    } while ((cur_match = (prev[cur_match & wmask]&0xffff)) > limit
	     && --chain_length != 0);

    if (best_len <= lookahead) return best_len;
    return lookahead;
  }
    
  int deflateInit(ZStream strm, int level, int bits){
    return deflateInit2(strm, level, Z_DEFLATED, bits, DEF_MEM_LEVEL,
			Z_DEFAULT_STRATEGY);
  }
  int deflateInit(ZStream strm, int level){
    return deflateInit(strm, level, MAX_WBITS);
  }
  int deflateInit2(ZStream strm, int level, int method,  int windowBits,
		   int memLevel, int strategy){
    int noheader = 0;
    //    byte[] my_version=ZLIB_VERSION;

    //
    //  if (version == null || version[0] != my_version[0]
    //  || stream_size != sizeof(z_stream)) {
    //  return Z_VERSION_ERROR;
    //  }

    strm.msg = null;

    if (level == Z_DEFAULT_COMPRESSION) level = 6;

    if (windowBits < 0) { // undocumented feature: suppress zlib header
      noheader = 1;
      windowBits = -windowBits;
    }

    if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || 
	method != Z_DEFLATED ||
	windowBits < 9 || windowBits > 15 || level < 0 || level > 9 ||
        strategy < 0 || strategy > Z_HUFFMAN_ONLY) {
      return Z_STREAM_ERROR;
    }

    strm.dstate = (Deflate)this;

    this.noheader = noheader;
    w_bits = windowBits;
    w_size = 1 << w_bits;
    w_mask = w_size - 1;

    hash_bits = memLevel + 7;
    hash_size = 1 << hash_bits;
    hash_mask = hash_size - 1;
    hash_shift = ((hash_bits+MIN_MATCH-1)/MIN_MATCH);

    window = new byte[w_size*2];
    prev = new short[w_size];
    head = new short[hash_size];

    lit_bufsize = 1 << (memLevel + 6); // 16K elements by default

    // We overlay pending_buf and d_buf+l_buf. This works since the average
    // output size for (length,distance) codes is <= 24 bits.
    pending_buf = new byte[lit_bufsize*4];
    pending_buf_size = lit_bufsize*4;

    d_buf = lit_bufsize/2;
    l_buf = (1+2)*lit_bufsize;

    this.level = level;

//System.out.println("level="+level);

    this.strategy = strategy;
    this.method = (byte)method;

    return deflateReset(strm);
  }
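
  // Typical use goes through ZStream rather than calling this class directly.
  // A minimal sketch, assuming the usual jzlib ZStream/JZlib API (fields
  // next_in, next_in_index, avail_in, next_out, next_out_index, avail_out and
  // the deflateInit/deflate/deflateEnd entry points):
  //
  //   ZStream z = new ZStream();
  //   z.deflateInit(6);                       // level 6, MAX_WBITS window
  //   z.next_in = input;   z.next_in_index = 0;   z.avail_in = input.length;
  //   z.next_out = output; z.next_out_index = 0;  z.avail_out = output.length;
  //   int err = z.deflate(JZlib.Z_FINISH);    // Z_STREAM_END when all input fit
  //   z.deflateEnd();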

  int deflateReset(ZStream strm){
    strm.total_in = strm.total_out = 0;
    strm.msg = null; //
    strm.data_type = Z_UNKNOWN;

    pending = 0;
    pending_out = 0;

    if(noheader < 0) {
      noheader = 0; // was set to -1 by deflate(..., Z_FINISH);
    }
    status = (noheader!=0) ? BUSY_STATE : INIT_STATE;
    strm.adler=strm._adler.adler32(0, null, 0, 0);

    last_flush = Z_NO_FLUSH;

    tr_init();
    lm_init();
    return Z_OK;
  }

  int deflateEnd(){
    if(status!=INIT_STATE && status!=BUSY_STATE && status!=FINISH_STATE){
      return Z_STREAM_ERROR;
    }
    // Deallocate in reverse order of allocations:
    pending_buf=null;
    head=null;
    prev=null;
    window=null;
    // free
    // dstate=null;
    return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK;
  }

  int deflateParams(ZStream strm, int _level, int _strategy){
    int err=Z_OK;

    if(_level == Z_DEFAULT_COMPRESSION){
      _level = 6;
    }
    if(_level < 0 || _level > 9 || 
       _strategy < 0 || _strategy > Z_HUFFMAN_ONLY) {
      return Z_STREAM_ERROR;
    }

    if(config_table[level].func!=config_table[_level].func &&
       strm.total_in != 0) {
      // Flush the last buffer:
      err = strm.deflate(Z_PARTIAL_FLUSH);
    }

    if(level != _level) {
      level = _level;
      max_lazy_match   = config_table[level].max_lazy;
      good_match       = config_table[level].good_length;
      nice_match       = config_table[level].nice_length;
      max_chain_length = config_table[level].max_chain;
    }
    strategy = _strategy;
    return err;
  }

  int deflateSetDictionary (ZStream strm, byte[] dictionary, int dictLength){
    int length = dictLength;
    int index=0;

    if(dictionary == null || status != INIT_STATE)
      return Z_STREAM_ERROR;

    strm.adler=strm._adler.adler32(strm.adler, dictionary, 0, dictLength);

    if(length < MIN_MATCH) return Z_OK;
    if(length > w_size-MIN_LOOKAHEAD){
      length = w_size-MIN_LOOKAHEAD;
      index=dictLength-length; // use the tail of the dictionary
    }
    System.arraycopy(dictionary, index, window, 0, length);
    strstart = length;
    block_start = length;

    // Insert all strings in the hash table (except for the last two bytes).
    // lookahead stays zero, so ins_h will be recomputed at the next
    // call of fill_window.

    ins_h = window[0]&0xff;
    ins_h=(((ins_h)<<hash_shift)^(window[1]&0xff))&hash_mask;

    for(int n=0; n<=length-MIN_MATCH; n++){
      ins_h=(((ins_h)<<hash_shift)^(window[(n)+(MIN_MATCH-1)]&0xff))&hash_mask;
      prev[n&w_mask]=head[ins_h];
      head[ins_h]=(short)n;
    }
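    // Note: the update above is the same rolling hash used while
    // compressing; since hash_shift is the ceiling of hash_bits/MIN_MATCH,
    // after MIN_MATCH (=3) steps of
    //   ins_h = ((ins_h << hash_shift) ^ (nextByte & 0xff)) & hash_mask;
    // the hash depends only on the last three bytes, so the dictionary is
    // indexed exactly as window data will be later (illustrative sketch,
    // "nextByte" is just a placeholder).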
    return Z_OK;
  }

  int deflate(ZStream strm, int flush){
    int old_flush;

    if(flush>Z_FINISH || flush<0){
      return Z_STREAM_ERROR;
    }

    if(strm.next_out == null ||
       (strm.next_in == null && strm.avail_in != 0) ||
       (status == FINISH_STATE && flush != Z_FINISH)) {
      strm.msg=z_errmsg[Z_NEED_DICT-(Z_STREAM_ERROR)];
      return Z_STREAM_ERROR;
    }
    if(strm.avail_out == 0){
      strm.msg=z_errmsg[Z_NEED_DICT-(Z_BUF_ERROR)];
      return Z_BUF_ERROR;
    }

    this.strm = strm; // just in case
    old_flush = last_flush;
    last_flush = flush;

    // Write the zlib header
    if(status == INIT_STATE) {
      int header = (Z_DEFLATED+((w_bits-8)<<4))<<8;
      int level_flags=((level-1)&0xff)>>1;

      if(level_flags>3) level_flags=3;
      header |= (level_flags<<6);
      if(strstart!=0) header |= PRESET_DICT;
      header+=31-(header % 31);

      status=BUSY_STATE;
      putShortMSB(header);


      // Save the adler32 of the preset dictionary:
      if(strstart!=0){
        putShortMSB((int)(strm.adler>>>16));
        putShortMSB((int)(strm.adler&0xffff));
      }
      strm.adler=strm._adler.adler32(0, null, 0, 0);
    }
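    // Worked example of the header arithmetic above (for illustration):
    // with the default windowBits = 15 and level = 6,
    //   header      = (8 + ((15-8)<<4)) << 8 = 0x7800
    //   level_flags = (6-1)>>1 = 2, so header |= 2<<6  -> 0x7880
    //   header     += 31 - (0x7880 % 31) = 28          -> 0x789C
    // which is the familiar "78 9C" zlib header; the final 16-bit value is
    // a multiple of 31 as required by the FCHECK field.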

    // Flush as much pending output as possible
    if(pending != 0) {
      strm.flush_pending();
      if(strm.avail_out == 0) {
	//System.out.println("  avail_out==0");
	// Since avail_out is 0, deflate will be called again with
	// more output space, but possibly with both pending and
	// avail_in equal to zero. There won't be anything to do,
	// but this is not an error situation so make sure we
	// return OK instead of BUF_ERROR at next call of deflate:
	last_flush = -1;
	return Z_OK;
      }

      // Make sure there is something to do and avoid duplicate consecutive
      // flushes. For repeated and useless calls with Z_FINISH, we keep
      // returning Z_STREAM_END instead of Z_BUF_ERROR.
    }
    else if(strm.avail_in==0 && flush <= old_flush &&
	    flush != Z_FINISH) {
      strm.msg=z_errmsg[Z_NEED_DICT-(Z_BUF_ERROR)];
      return Z_BUF_ERROR;
    }

    // User must not provide more input after the first FINISH:
    if(status == FINISH_STATE && strm.avail_in != 0) {
      strm.msg=z_errmsg[Z_NEED_DICT-(Z_BUF_ERROR)];
      return Z_BUF_ERROR;
    }

    // Start a new block or continue the current one.
    if(strm.avail_in!=0 || lookahead!=0 ||
       (flush != Z_NO_FLUSH && status != FINISH_STATE)) {
      int bstate=-1;
      switch(config_table[level].func){
      case STORED: 
	bstate = deflate_stored(flush);
	break;
      case FAST: 
	bstate = deflate_fast(flush);
	break;
      case SLOW: 
	bstate = deflate_slow(flush);
	break;
      default:
      }

      if (bstate==FinishStarted || bstate==FinishDone) {
	status = FINISH_STATE;
      }
      if (bstate==NeedMore || bstate==FinishStarted) {
	if(strm.avail_out == 0) {
	  last_flush = -1; // avoid BUF_ERROR next call, see above
	}
	return Z_OK;
	// If flush != Z_NO_FLUSH && avail_out == 0, the next call
	// of deflate should use the same flush parameter to make sure
	// that the flush is complete. So we don't have to output an
	// empty block here, this will be done at next call. This also
	// ensures that for a very small output buffer, we emit at most
	// one empty block.
      }

      if (bstate==BlockDone) {
	if(flush == Z_PARTIAL_FLUSH) {
	  _tr_align();
	} 
	else { // FULL_FLUSH or SYNC_FLUSH
	  _tr_stored_block(0, 0, false);
	  // For a full flush, this empty block will be recognized
	  // as a special marker by inflate_sync().
	  if(flush == Z_FULL_FLUSH) {
	    //state.head[s.hash_size-1]=0;
	    for(int i=0; i<hash_size/*-1*/; i++)  // forget history
	      head[i]=0;
	  }
	}
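        // (Illustrative note: the empty stored block written above ends with
        // the bytes 00 00 FF FF, i.e. a zero length and its one's complement,
        // which is the pattern inflate_sync() scans for.)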
	strm.flush_pending();
	if(strm.avail_out == 0) {
	  last_flush = -1; // avoid BUF_ERROR at next call, see above
	  return Z_OK;
	}
      }
    }

    if(flush!=Z_FINISH) return Z_OK;
    if(noheader!=0) return Z_STREAM_END;

    // Write the zlib trailer (adler32)
    putShortMSB((int)(strm.adler>>>16));
    putShortMSB((int)(strm.adler&0xffff));
    strm.flush_pending();

    // If avail_out is zero, the application will call deflate again
    // to flush the rest.
    noheader = -1; // write the trailer only once!
    return pending != 0 ? Z_OK : Z_STREAM_END;
  }
}
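
// A minimal usage sketch (illustrative only; it assumes the ZStream and
// JZlib classes from this package with their usual fields):
//
//   ZStream z = new ZStream();
//   z.deflateInit(JZlib.Z_DEFAULT_COMPRESSION);
//   z.next_in = input;   z.next_in_index = 0;   z.avail_in = input.length;
//   z.next_out = output; z.next_out_index = 0;  z.avail_out = output.length;
//   int err;
//   do {
//     err = z.deflate(JZlib.Z_FINISH);  // grow/refill 'output' if avail_out hits 0
//   } while (err == JZlib.Z_OK);
//   z.deflateEnd();                     // err should be Z_STREAM_END on success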
pan>); show_execution_state(regs); panic("Assertion '%s' failed at %.50s:%d\n", predicate, filename, lineno); die: if ( (fixup = search_exception_table(regs->eip)) != 0 ) { regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_invalid_op, regs); show_execution_state(regs); panic("FATAL TRAP: vector = %d (invalid opcode)\n", TRAP_invalid_op); } asmlinkage void do_int3(struct cpu_user_regs *regs) { DEBUGGER_trap_entry(TRAP_int3, regs); if ( !guest_mode(regs) ) { debugger_trap_fatal(TRAP_int3, regs); return; } do_guest_trap(TRAP_int3, regs, 0); } asmlinkage void do_machine_check(struct cpu_user_regs *regs) { machine_check_vector(regs, regs->error_code); } static void reserved_bit_page_fault( unsigned long addr, struct cpu_user_regs *regs) { printk("d%d:v%d: reserved bit in page table (ec=%04X)\n", current->domain->domain_id, current->vcpu_id, regs->error_code); show_page_walk(addr); show_execution_state(regs); } void propagate_page_fault(unsigned long addr, u16 error_code) { struct trap_info *ti; struct vcpu *v = current; struct trap_bounce *tb = &v->arch.pv_vcpu.trap_bounce; v->arch.pv_vcpu.ctrlreg[2] = addr; arch_set_cr2(v, addr); /* Re-set error_code.user flag appropriately for the guest. */ error_code &= ~PFEC_user_mode; if ( !guest_kernel_mode(v, guest_cpu_user_regs()) ) error_code |= PFEC_user_mode; trace_pv_page_fault(addr, error_code); ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_page_fault]; tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE; tb->error_code = error_code; tb->cs = ti->cs; tb->eip = ti->address; if ( TI_GET_IF(ti) ) tb->flags |= TBF_INTERRUPT; if ( unlikely(null_trap_bounce(v, tb)) ) { printk("d%d:v%d: unhandled page fault (ec=%04X)\n", v->domain->domain_id, v->vcpu_id, error_code); show_page_walk(addr); } if ( unlikely(error_code & PFEC_reserved_bit) ) reserved_bit_page_fault(addr, guest_cpu_user_regs()); } static int handle_gdt_ldt_mapping_fault( unsigned long offset, struct cpu_user_regs *regs) { struct vcpu *curr = current; /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */ unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1; unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT); /* * If the fault is in another vcpu's area, it cannot be due to * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and * indeed we have to since map_ldt_shadow_page() works correctly only on * accesses to a vcpu's own area. */ if ( vcpu_area != curr->vcpu_id ) return 0; /* Byte offset within the gdt/ldt sub-area. */ offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL; if ( likely(is_ldt_area) ) { /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */ if ( likely(map_ldt_shadow_page(offset >> PAGE_SHIFT)) ) { if ( guest_mode(regs) ) trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT, regs->eip, offset); } else { /* In hypervisor mode? Leave it to the #PF handler to fix up. */ if ( !guest_mode(regs) ) return 0; /* In guest mode? Propagate #PF to guest, with adjusted %cr2. */ propagate_page_fault( curr->arch.pv_vcpu.ldt_base + offset, regs->error_code); } } else { /* GDT fault: handle the fault as #GP(selector). 
*/ regs->error_code = (u16)offset & ~7; (void)do_general_protection(regs); } return EXCRET_fault_fixed; } #ifdef HYPERVISOR_VIRT_END #define IN_HYPERVISOR_RANGE(va) \ (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END)) #else #define IN_HYPERVISOR_RANGE(va) \ (((va) >= HYPERVISOR_VIRT_START)) #endif enum pf_type { real_fault, smep_fault, spurious_fault }; static enum pf_type __page_fault_type( unsigned long addr, unsigned int error_code) { unsigned long mfn, cr3 = read_cr3(); #if CONFIG_PAGING_LEVELS >= 4 l4_pgentry_t l4e, *l4t; #endif #if CONFIG_PAGING_LEVELS >= 3 l3_pgentry_t l3e, *l3t; #endif l2_pgentry_t l2e, *l2t; l1_pgentry_t l1e, *l1t; unsigned int required_flags, disallowed_flags, page_user; /* * We do not take spurious page faults in IRQ handlers as we do not * modify page tables in IRQ context. We therefore bail here because * map_domain_page() is not IRQ-safe. */ if ( in_irq() ) return real_fault; /* Reserved bit violations are never spurious faults. */ if ( error_code & PFEC_reserved_bit ) return real_fault; required_flags = _PAGE_PRESENT; if ( error_code & PFEC_write_access ) required_flags |= _PAGE_RW; if ( error_code & PFEC_user_mode ) required_flags |= _PAGE_USER; disallowed_flags = 0; if ( error_code & PFEC_insn_fetch ) disallowed_flags |= _PAGE_NX_BIT; page_user = _PAGE_USER; mfn = cr3 >> PAGE_SHIFT; #if CONFIG_PAGING_LEVELS >= 4 l4t = map_domain_page(mfn); l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]); mfn = l4e_get_pfn(l4e); unmap_domain_page(l4t); if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) || (l4e_get_flags(l4e) & disallowed_flags) ) return real_fault; page_user &= l4e_get_flags(l4e); #endif #if CONFIG_PAGING_LEVELS >= 3 l3t = map_domain_page(mfn); #if CONFIG_PAGING_LEVELS == 3 l3t += (cr3 & 0xFE0UL) >> 3; #endif l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]); mfn = l3e_get_pfn(l3e); unmap_domain_page(l3t); #if CONFIG_PAGING_LEVELS == 3 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ) return real_fault; #else if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) || (l3e_get_flags(l3e) & disallowed_flags) ) return real_fault; page_user &= l3e_get_flags(l3e); if ( l3e_get_flags(l3e) & _PAGE_PSE ) goto leaf; #endif #endif l2t = map_domain_page(mfn); l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]); mfn = l2e_get_pfn(l2e); unmap_domain_page(l2t); if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) || (l2e_get_flags(l2e) & disallowed_flags) ) return real_fault; page_user &= l2e_get_flags(l2e); if ( l2e_get_flags(l2e) & _PAGE_PSE ) goto leaf; l1t = map_domain_page(mfn); l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]); mfn = l1e_get_pfn(l1e); unmap_domain_page(l1t); if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) || (l1e_get_flags(l1e) & disallowed_flags) ) return real_fault; page_user &= l1e_get_flags(l1e); leaf: /* * Supervisor Mode Execution Protection (SMEP): * Disallow supervisor execution from user-accessible mappings */ if ( (read_cr4() & X86_CR4_SMEP) && page_user && ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) ) return smep_fault; return spurious_fault; } static enum pf_type spurious_page_fault( unsigned long addr, unsigned int error_code) { unsigned long flags; enum pf_type pf_type; /* * Disabling interrupts prevents TLB flushing, and hence prevents * page tables from becoming invalid under our feet during the walk. 
*/ local_irq_save(flags); pf_type = __page_fault_type(addr, error_code); local_irq_restore(flags); return pf_type; } static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs) { struct vcpu *v = current; struct domain *d = v->domain; /* No fixups in interrupt context or when interrupts are disabled. */ if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) ) return 0; /* Faults from external-mode guests are handled by shadow/hap */ if ( paging_mode_external(d) && guest_mode(regs) ) { int ret = paging_fault(addr, regs); if ( ret == EXCRET_fault_fixed ) trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr); return ret; } if ( !(regs->error_code & PFEC_page_present) && (pagefault_by_memadd(addr, regs)) ) return handle_memadd_fault(addr, regs); if ( unlikely(IN_HYPERVISOR_RANGE(addr)) ) { if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) && (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) ) return handle_gdt_ldt_mapping_fault( addr - GDT_LDT_VIRT_START, regs); return 0; } if ( VM_ASSIST(d, VMASST_TYPE_writable_pagetables) && guest_kernel_mode(v, regs) ) { unsigned int mbs = PFEC_write_access; unsigned int mbz = PFEC_reserved_bit | PFEC_insn_fetch; /* Do not check if access-protection fault since the page may legitimately be not present in shadow page tables */ if ( !paging_mode_enabled(d) ) mbs |= PFEC_page_present; if ( ((regs->error_code & (mbs | mbz)) == mbs) && ptwr_do_page_fault(v, addr, regs) ) return EXCRET_fault_fixed; } /* For non-external shadowed guests, we fix up both their own * pagefaults and Xen's, since they share the pagetables. */ if ( paging_mode_enabled(d) && !paging_mode_external(d) ) { int ret = paging_fault(addr, regs); if ( ret == EXCRET_fault_fixed ) trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->eip, addr); return ret; } return 0; } /* * #PF error code: * Bit 0: Protection violation (=1) ; Page not present (=0) * Bit 1: Write access * Bit 2: User mode (=1) ; Supervisor mode (=0) * Bit 3: Reserved bit violation * Bit 4: Instruction fetch */ asmlinkage void do_page_fault(struct cpu_user_regs *regs) { unsigned long addr, fixup; unsigned int error_code; enum pf_type pf_type; addr = read_cr2(); /* fixup_page_fault() might change regs->error_code, so cache it here. */ error_code = regs->error_code; DEBUGGER_trap_entry(TRAP_page_fault, regs); perfc_incr(page_faults); if ( unlikely(fixup_page_fault(addr, regs) != 0) ) return; if ( unlikely(!guest_mode(regs)) ) { pf_type = spurious_page_fault(addr, error_code); BUG_ON(pf_type == smep_fault); if ( pf_type != real_fault ) return; if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { perfc_incr(copy_user_faults); if ( unlikely(regs->error_code & PFEC_reserved_bit) ) reserved_bit_page_fault(addr, regs); regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_page_fault, regs); show_execution_state(regs); show_page_walk(addr); panic("FATAL PAGE FAULT\n" "[error_code=%04x]\n" "Faulting linear address: %p\n", error_code, _p(addr)); } if ( unlikely(current->domain->arch.suppress_spurious_page_faults) ) { pf_type = spurious_page_fault(addr, error_code); if ( pf_type == smep_fault ) { gdprintk(XENLOG_ERR, "Fatal SMEP fault\n"); domain_crash(current->domain); } if ( pf_type != real_fault ) return; } propagate_page_fault(addr, regs->error_code); } /* * Early #PF handler to print CR2, error code, and stack. * * We also deal with spurious faults here, even though they should never happen * during early boot (an issue was seen once, but was most likely a hardware * problem). 
*/ asmlinkage void __init do_early_page_fault(struct cpu_user_regs *regs) { static int stuck; static unsigned long prev_eip, prev_cr2; unsigned long cr2 = read_cr2(); BUG_ON(smp_processor_id() != 0); if ( (regs->eip != prev_eip) || (cr2 != prev_cr2) ) { prev_eip = regs->eip; prev_cr2 = cr2; stuck = 0; return; } if ( stuck++ == 1000 ) { unsigned long *stk = (unsigned long *)regs; printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n", regs->cs, _p(regs->eip), _p(cr2), regs->error_code); show_page_walk(cr2); printk("Stack dump: "); while ( ((long)stk & ((PAGE_SIZE - 1) & ~(BYTES_PER_LONG - 1))) != 0 ) printk("%p ", _p(*stk++)); for ( ; ; ) ; } } long do_fpu_taskswitch(int set) { struct vcpu *v = current; if ( set ) { v->arch.pv_vcpu.ctrlreg[0] |= X86_CR0_TS; stts(); } else { v->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS; if ( v->fpu_dirtied ) clts(); } return 0; } static int read_descriptor(unsigned int sel, const struct vcpu *v, const struct cpu_user_regs * regs, unsigned long *base, unsigned long *limit, unsigned int *ar, unsigned int vm86attr) { struct desc_struct desc; if ( !vm86_mode(regs) ) { if ( sel < 4) desc.b = desc.a = 0; else if ( __get_user(desc, (const struct desc_struct *)(!(sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + (sel >> 3)) ) return 0; if ( !(vm86attr & _SEGMENT_CODE) ) desc.b &= ~_SEGMENT_L; } else { desc.a = (sel << 20) | 0xffff; desc.b = vm86attr | (sel >> 12); } *ar = desc.b & 0x00f0ff00; if ( !(desc.b & _SEGMENT_L) ) { *base = ((desc.a >> 16) + ((desc.b & 0xff) << 16) + (desc.b & 0xff000000)); *limit = (desc.a & 0xffff) | (desc.b & 0x000f0000); if ( desc.b & _SEGMENT_G ) *limit = ((*limit + 1) << 12) - 1; #ifndef NDEBUG if ( !vm86_mode(regs) && (sel > 3) ) { unsigned int a, l; unsigned char valid; asm volatile ( "larl %2,%0 ; setz %1" : "=r" (a), "=qm" (valid) : "rm" (sel)); BUG_ON(valid && ((a & 0x00f0ff00) != *ar)); asm volatile ( "lsll %2,%0 ; setz %1" : "=r" (l), "=qm" (valid) : "rm" (sel)); BUG_ON(valid && (l != *limit)); } #endif } else { *base = 0UL; *limit = ~0UL; } return 1; } #ifdef __x86_64__ static int read_gate_descriptor(unsigned int gate_sel, const struct vcpu *v, unsigned int *sel, unsigned long *off, unsigned int *ar) { struct desc_struct desc; const struct desc_struct *pdesc; pdesc = (const struct desc_struct *) (!(gate_sel & 4) ? GDT_VIRT_START(v) : LDT_VIRT_START(v)) + (gate_sel >> 3); if ( (gate_sel < 4) || ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) || __get_user(desc, pdesc) ) return 0; *sel = (desc.a >> 16) & 0x0000fffc; *off = (desc.a & 0x0000ffff) | (desc.b & 0xffff0000); *ar = desc.b & 0x0000ffff; /* * check_descriptor() clears the DPL field and stores the * guest requested DPL in the selector's RPL field. */ if ( *ar & _SEGMENT_DPL ) return 0; *ar |= (desc.a >> (16 - 13)) & _SEGMENT_DPL; if ( !is_pv_32bit_vcpu(v) ) { if ( (*ar & 0x1f00) != 0x0c00 || (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) || __get_user(desc, pdesc + 1) || (desc.b & 0x1f00) ) return 0; *off |= (unsigned long)desc.a << 32; return 1; } switch ( *ar & 0x1f00 ) { case 0x0400: *off &= 0xffff; break; case 0x0c00: break; default: return 0; } return 1; } #endif /* Has the guest requested sufficient permission for this I/O access? */ static int guest_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { #if defined(__x86_64__) /* If in user mode, switch to kernel mode just to read I/O bitmap. 
*/ int user_mode = !(v->arch.flags & TF_kernel_mode); #define TOGGLE_MODE() if ( user_mode ) toggle_guest_mode(v) #elif defined(__i386__) #define TOGGLE_MODE() ((void)0) #endif if ( !vm86_mode(regs) && (v->arch.pv_vcpu.iopl >= (guest_kernel_mode(v, regs) ? 1 : 3)) ) return 1; if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) ) { union { uint8_t bytes[2]; uint16_t mask; } x; /* * Grab permission bytes from guest space. Inaccessible bytes are * read as 0xff (no access allowed). */ TOGGLE_MODE(); switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp, port>>3, 2) ) { default: x.bytes[0] = ~0; case 1: x.bytes[1] = ~0; case 0: break; } TOGGLE_MODE(); if ( (x.mask & (((1<<bytes)-1) << (port&7))) == 0 ) return 1; } return 0; } /* Has the administrator granted sufficient permission for this I/O access? */ static int admin_io_okay( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { /* * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses. * We never permit direct access to that register. */ if ( (port == 0xcf8) && (bytes == 4) ) return 0; return ioports_access_permitted(v->domain, port, port + bytes - 1); } static uint32_t guest_io_read( unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { uint32_t data = 0; unsigned int shift = 0; if ( admin_io_okay(port, bytes, v, regs) ) { switch ( bytes ) { case 1: return inb(port); case 2: return inw(port); case 4: return inl(port); } } while ( bytes != 0 ) { unsigned int size = 1; uint32_t sub_data = 0xff; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { sub_data = pv_pit_handler(port, 0, 0); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; sub_data = v->domain->arch.pci_cf8; } else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; sub_data = pci_conf_read(v->domain->arch.pci_cf8, port & 3, size); } if ( size == 4 ) return sub_data; data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; shift += size * 8; port += size; bytes -= size; } return data; } static void guest_io_write( unsigned int port, unsigned int bytes, uint32_t data, struct vcpu *v, struct cpu_user_regs *regs) { if ( admin_io_okay(port, bytes, v, regs) ) { switch ( bytes ) { case 1: if ( ((port == 0x70) || (port == 0x71)) && pv_rtc_handler ) pv_rtc_handler(port, (uint8_t)data); outb((uint8_t)data, port); if ( pv_post_outb_hook ) pv_post_outb_hook(port, (uint8_t)data); break; case 2: outw((uint16_t)data, port); break; case 4: outl(data, port); break; } return; } while ( bytes != 0 ) { unsigned int size = 1; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { pv_pit_handler(port, (uint8_t)data, 1); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; v->domain->arch.pci_cf8 = data; } else if ( ((port & 0xfffc) == 0xcfc) && IS_PRIV(v->domain) ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; pci_conf_write(v->domain->arch.pci_cf8, port & 3, size, data); } if ( size == 4 ) return; port += size; bytes -= size; data >>= size * 8; } } /* I/O emulation support. 
Helper routines for, and type of, the stack stub.*/ void host_to_guest_gpr_switch(struct cpu_user_regs *) __attribute__((__regparm__(1))); unsigned long guest_to_host_gpr_switch(unsigned long) __attribute__((__regparm__(1))); void (*pv_post_outb_hook)(unsigned int port, u8 value); static inline uint64_t guest_misc_enable(uint64_t val) { val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_XTPR_DISABLE; return val; } /* Instruction fetch with error handling. */ #define insn_fetch(type, base, eip, limit) \ ({ unsigned long _rc, _ptr = (base) + (eip); \ type _x; \ if ( ad_default < 8 ) \ _ptr = (unsigned int)_ptr; \ if ( (limit) < sizeof(_x) - 1 || (eip) > (limit) - (sizeof(_x) - 1) ) \ goto fail; \ if ( (_rc = copy_from_user(&_x, (type *)_ptr, sizeof(_x))) != 0 ) \ { \ propagate_page_fault(_ptr + sizeof(_x) - _rc, 0); \ goto skip; \ } \ (eip) += sizeof(_x); _x; }) #if defined(CONFIG_X86_32) # define read_sreg(regs, sr) ((regs)->sr) #elif defined(CONFIG_X86_64) # define read_sreg(regs, sr) read_segment_register(sr) #endif static int is_cpufreq_controller(struct domain *d) { return ((cpufreq_controller == FREQCTL_dom0_kernel) && (d->domain_id == 0)); } #ifdef CONFIG_X86_64 #include "x86_64/mmconfig.h" #endif static int emulate_privileged_op(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long *reg, eip = regs->eip; u8 opcode, modrm_reg = 0, modrm_rm = 0, rep_prefix = 0, lock = 0, rex = 0; enum { lm_seg_none, lm_seg_fs, lm_seg_gs } lm_ovr = lm_seg_none; int rc; unsigned int port, i, data_sel, ar, data, bpmatch = 0; unsigned int op_bytes, op_default, ad_bytes, ad_default, opsize_prefix= 0; #define rd_ad(reg) (ad_bytes >= sizeof(regs->reg) \ ? regs->reg \ : ad_bytes == 4 \ ? (u32)regs->reg \ : (u16)regs->reg) #define wr_ad(reg, val) (ad_bytes >= sizeof(regs->reg) \ ? regs->reg = (val) \ : ad_bytes == 4 \ ? (*(u32 *)&regs->reg = (val)) \ : (*(u16 *)&regs->reg = (val))) unsigned long code_base, code_limit; char io_emul_stub[32]; void (*io_emul)(struct cpu_user_regs *) __attribute__((__regparm__(1))); uint64_t val, msr_content; if ( !read_descriptor(regs->cs, v, regs, &code_base, &code_limit, &ar, _SEGMENT_CODE|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P) ) goto fail; op_default = op_bytes = (ar & (_SEGMENT_L|_SEGMENT_DB)) ? 4 : 2; ad_default = ad_bytes = (ar & _SEGMENT_L) ? 8 : op_default; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || !(ar & _SEGMENT_CODE) ) goto fail; /* emulating only opcodes not allowing SS to be default */ data_sel = read_sreg(regs, ds); /* Legacy prefixes. */ for ( i = 0; i < 8; i++, rex == opcode || (rex = 0) ) { switch ( opcode = insn_fetch(u8, code_base, eip, code_limit) ) { case 0x66: /* operand-size override */ opsize_prefix = 1; op_bytes = op_default ^ 6; /* switch between 2/4 bytes */ continue; case 0x67: /* address-size override */ ad_bytes = ad_default != 4 ? 
4 : 2; /* switch to 2/4 bytes */ continue; case 0x2e: /* CS override */ data_sel = regs->cs; continue; case 0x3e: /* DS override */ data_sel = read_sreg(regs, ds); continue; case 0x26: /* ES override */ data_sel = read_sreg(regs, es); continue; case 0x64: /* FS override */ data_sel = read_sreg(regs, fs); lm_ovr = lm_seg_fs; continue; case 0x65: /* GS override */ data_sel = read_sreg(regs, gs); lm_ovr = lm_seg_gs; continue; case 0x36: /* SS override */ data_sel = regs->ss; continue; case 0xf0: /* LOCK */ lock = 1; continue; case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP/REPE/REPZ */ rep_prefix = 1; continue; default: if ( (ar & _SEGMENT_L) && (opcode & 0xf0) == 0x40 ) { rex = opcode; continue; } break; } break; } /* REX prefix. */ if ( rex & 8 ) /* REX.W */ op_bytes = 4; /* emulate only opcodes not supporting 64-bit operands */ modrm_reg = (rex & 4) << 1; /* REX.R */ /* REX.X does not need to be decoded. */ modrm_rm = (rex & 1) << 3; /* REX.B */ if ( opcode == 0x0f ) goto twobyte_opcode; if ( lock ) goto fail; /* Input/Output String instructions. */ if ( (opcode >= 0x6c) && (opcode <= 0x6f) ) { unsigned long data_base, data_limit; if ( rep_prefix && (rd_ad(ecx) == 0) ) goto done; if ( !(opcode & 2) ) { data_sel = read_sreg(regs, es); lm_ovr = lm_seg_none; } if ( !(ar & _SEGMENT_L) ) { if ( !read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL| _SEGMENT_P) ) goto fail; if ( !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || (opcode & 2 ? (ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR) : (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR)) ) goto fail; } #ifdef CONFIG_X86_64 else { if ( lm_ovr == lm_seg_none || data_sel < 4 ) { switch ( lm_ovr ) { case lm_seg_none: data_base = 0UL; break; case lm_seg_fs: data_base = v->arch.pv_vcpu.fs_base; break; case lm_seg_gs: if ( guest_kernel_mode(v, regs) ) data_base = v->arch.pv_vcpu.gs_base_kernel; else data_base = v->arch.pv_vcpu.gs_base_user; break; } } else read_descriptor(data_sel, v, regs, &data_base, &data_limit, &ar, 0); data_limit = ~0UL; ar = _SEGMENT_WR|_SEGMENT_S|_SEGMENT_DPL|_SEGMENT_P; } #endif port = (u16)regs->edx; continue_io_string: switch ( opcode ) { case 0x6c: /* INSB */ op_bytes = 1; case 0x6d: /* INSW/INSL */ if ( (data_limit < (op_bytes - 1)) || (rd_ad(edi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; data = guest_io_read(port, op_bytes, v, regs); if ( (rc = copy_to_user((void *)data_base + rd_ad(edi), &data, op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(edi) + op_bytes - rc, PFEC_write_access); return EXCRET_fault_fixed; } wr_ad(edi, regs->edi + (int)((regs->eflags & X86_EFLAGS_DF) ? -op_bytes : op_bytes)); break; case 0x6e: /* OUTSB */ op_bytes = 1; case 0x6f: /* OUTSW/OUTSL */ if ( (data_limit < (op_bytes - 1)) || (rd_ad(esi) > (data_limit - (op_bytes - 1))) || !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( (rc = copy_from_user(&data, (void *)data_base + rd_ad(esi), op_bytes)) != 0 ) { propagate_page_fault(data_base + rd_ad(esi) + op_bytes - rc, 0); return EXCRET_fault_fixed; } guest_io_write(port, op_bytes, data, v, regs); wr_ad(esi, regs->esi + (int)((regs->eflags & X86_EFLAGS_DF) ? -op_bytes : op_bytes)); break; } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); if ( rep_prefix && (wr_ad(ecx, regs->ecx - 1) != 0) ) { if ( !bpmatch && !hypercall_preempt_check() ) goto continue_io_string; eip = regs->eip; } goto done; } /* * Very likely to be an I/O instruction (IN/OUT). 
* Build an on-stack stub to execute the instruction with full guest * GPR context. This is needed for some systems which (ab)use IN/OUT * to communicate with BIOS code in system-management mode. */ #ifdef __x86_64__ /* movq $host_to_guest_gpr_switch,%rcx */ io_emul_stub[0] = 0x48; io_emul_stub[1] = 0xb9; *(void **)&io_emul_stub[2] = (void *)host_to_guest_gpr_switch; /* callq *%rcx */ io_emul_stub[10] = 0xff; io_emul_stub[11] = 0xd1; #else /* call host_to_guest_gpr_switch */ io_emul_stub[0] = 0xe8; *(s32 *)&io_emul_stub[1] = (char *)host_to_guest_gpr_switch - &io_emul_stub[5]; /* 7 x nop */ memset(&io_emul_stub[5], 0x90, 7); #endif /* data16 or nop */ io_emul_stub[12] = (op_bytes != 2) ? 0x90 : 0x66; /* <io-access opcode> */ io_emul_stub[13] = opcode; /* imm8 or nop */ io_emul_stub[14] = 0x90; /* ret (jumps to guest_to_host_gpr_switch) */ io_emul_stub[15] = 0xc3; /* Handy function-typed pointer to the stub. */ io_emul = (void *)io_emul_stub; if ( ioemul_handle_quirk ) ioemul_handle_quirk(opcode, &io_emul_stub[12], regs); /* I/O Port and Interrupt Flag instructions. */ switch ( opcode ) { case 0xe4: /* IN imm8,%al */ op_bytes = 1; case 0xe5: /* IN imm8,%eax */ port = insn_fetch(u8, code_base, eip, code_limit); io_emul_stub[14] = port; /* imm8 */ exec_in: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( admin_io_okay(port, op_bytes, v, regs) ) { io_emul(regs); } else { if ( op_bytes == 4 ) regs->eax = 0; else regs->eax &= ~((1u << (op_bytes * 8)) - 1); regs->eax |= guest_io_read(port, op_bytes, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; case 0xec: /* IN %dx,%al */ op_bytes = 1; case 0xed: /* IN %dx,%eax */ port = (u16)regs->edx; goto exec_in; case 0xe6: /* OUT %al,imm8 */ op_bytes = 1; case 0xe7: /* OUT %eax,imm8 */ port = insn_fetch(u8, code_base, eip, code_limit); io_emul_stub[14] = port; /* imm8 */ exec_out: if ( !guest_io_okay(port, op_bytes, v, regs) ) goto fail; if ( admin_io_okay(port, op_bytes, v, regs) ) { if ( (op_bytes == 1) && ((port == 0x71) || (port == 0x70)) && pv_rtc_handler ) pv_rtc_handler(port, regs->eax); io_emul(regs); if ( (op_bytes == 1) && pv_post_outb_hook ) pv_post_outb_hook(port, regs->eax); } else { guest_io_write(port, op_bytes, regs->eax, v, regs); } bpmatch = check_guest_io_breakpoint(v, port, op_bytes); goto done; case 0xee: /* OUT %al,%dx */ op_bytes = 1; case 0xef: /* OUT %eax,%dx */ port = (u16)regs->edx; goto exec_out; case 0xfa: /* CLI */ case 0xfb: /* STI */ if ( v->arch.pv_vcpu.iopl < (guest_kernel_mode(v, regs) ? 1 : 3) ) goto fail; /* * This is just too dangerous to allow, in my opinion. Consider if the * caller then tries to reenable interrupts using POPF: we can't trap * that and we'll end up with hard-to-debug lockups. Fast & loose will * do for us. :-) */ /*v->vcpu_info->evtchn_upcall_mask = (opcode == 0xfa);*/ goto done; } /* No decode of this single-byte opcode. */ goto fail; twobyte_opcode: /* * All 2 and 3 byte opcodes, except RDTSC (0x31) and RDTSCP (0x1,0xF9) * are executable only from guest kernel mode (virtual ring 0). 
*/ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( !guest_kernel_mode(v, regs) && (opcode != 0x1) && (opcode != 0x31) ) goto fail; if ( lock && (opcode & ~3) != 0x20 ) goto fail; switch ( opcode ) { case 0x1: /* RDTSCP and XSETBV */ switch ( insn_fetch(u8, code_base, eip, code_limit) ) { case 0xf9: /* RDTSCP */ if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) && !guest_kernel_mode(v, regs) ) goto fail; pv_soft_rdtsc(v, regs, 1); break; case 0xd1: /* XSETBV */ { u64 new_xfeature = (u32)regs->eax | ((u64)regs->edx << 32); if ( lock || rep_prefix || opsize_prefix || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ) { do_guest_trap(TRAP_invalid_op, regs, 0); goto skip; } if ( !guest_kernel_mode(v, regs) ) goto fail; switch ( (u32)regs->ecx ) { case XCR_XFEATURE_ENABLED_MASK: /* bit 0 of XCR0 must be set and reserved bit must not be set */ if ( !(new_xfeature & XSTATE_FP) || (new_xfeature & ~xfeature_mask) ) goto fail; v->arch.xcr0 = new_xfeature; v->arch.xcr0_accum |= new_xfeature; set_xcr0(new_xfeature); break; default: goto fail; } break; } default: goto fail; } break; case 0x06: /* CLTS */ (void)do_fpu_taskswitch(0); break; case 0x09: /* WBINVD */ /* Ignore the instruction if unprivileged. */ if ( !cache_flush_permitted(v->domain) ) /* Non-physdev domain attempted WBINVD; ignore for now since newer linux uses this in some start-of-day timing loops */ ; else wbinvd(); break; case 0x20: /* MOV CR?,<reg> */ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); switch ( modrm_reg ) { case 0: /* Read CR0 */ *reg = (read_cr0() & ~X86_CR0_TS) | v->arch.pv_vcpu.ctrlreg[0]; break; case 2: /* Read CR2 */ *reg = v->arch.pv_vcpu.ctrlreg[2]; break; case 3: /* Read CR3 */ { unsigned long mfn; if ( !is_pv_32on64_vcpu(v) ) { mfn = pagetable_get_pfn(v->arch.guest_table); *reg = xen_pfn_to_cr3(mfn_to_gmfn( v->domain, mfn)); } #ifdef CONFIG_COMPAT else { mfn = l4e_get_pfn(*(l4_pgentry_t *)__va(pagetable_get_paddr(v->arch.guest_table))); *reg = compat_pfn_to_cr3(mfn_to_gmfn( v->domain, mfn)); } #endif /* PTs should not be shared */ BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow); } break; case 4: /* Read CR4 */ *reg = v->arch.pv_vcpu.ctrlreg[4]; break; default: goto fail; } break; case 0x21: /* MOV DR?,<reg> */ { unsigned long res; opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); if ( (res = do_get_debugreg(modrm_reg)) > (unsigned long)-256 ) goto fail; *reg = res; break; } case 0x22: /* MOV <reg>,CR? 
*/ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); switch ( modrm_reg ) { case 0: /* Write CR0 */ if ( (*reg ^ read_cr0()) & ~X86_CR0_TS ) { gdprintk(XENLOG_WARNING, "Attempt to change unmodifiable CR0 flags.\n"); goto fail; } (void)do_fpu_taskswitch(!!(*reg & X86_CR0_TS)); break; case 2: /* Write CR2 */ v->arch.pv_vcpu.ctrlreg[2] = *reg; arch_set_cr2(v, *reg); break; case 3: /* Write CR3 */ domain_lock(v->domain); if ( !is_pv_32on64_vcpu(v) ) rc = new_guest_cr3(gmfn_to_mfn(v->domain, xen_cr3_to_pfn(*reg))); #ifdef CONFIG_COMPAT else rc = new_guest_cr3(gmfn_to_mfn(v->domain, compat_cr3_to_pfn(*reg))); #endif domain_unlock(v->domain); if ( rc == 0 ) /* not okay */ goto fail; break; case 4: /* Write CR4 */ v->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(v, *reg); write_cr4(pv_guest_cr4_to_real_cr4(v)); break; default: goto fail; } break; case 0x23: /* MOV <reg>,DR? */ opcode = insn_fetch(u8, code_base, eip, code_limit); if ( opcode < 0xc0 ) goto fail; modrm_reg += ((opcode >> 3) & 7) + (lock << 3); modrm_rm |= (opcode >> 0) & 7; reg = decode_register(modrm_rm, regs, 0); if ( do_set_debugreg(modrm_reg, *reg) != 0 ) goto fail; break; case 0x30: /* WRMSR */ { uint32_t eax = regs->eax; uint32_t edx = regs->edx; msr_content = ((uint64_t)edx << 32) | eax; switch ( (u32)regs->ecx ) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; if ( wrmsr_safe(MSR_FS_BASE, msr_content) ) goto fail; v->arch.pv_vcpu.fs_base = msr_content; break; case MSR_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; if ( wrmsr_safe(MSR_GS_BASE, msr_content) ) goto fail; v->arch.pv_vcpu.gs_base_kernel = msr_content; break; case MSR_SHADOW_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; if ( wrmsr_safe(MSR_SHADOW_GS_BASE, msr_content) ) goto fail; v->arch.pv_vcpu.gs_base_user = msr_content; break; #endif case MSR_K7_FID_VID_STATUS: case MSR_K7_FID_VID_CTL: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: case MSR_K8_HWCR: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; case MSR_AMD64_NB_CFG: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) goto fail; if ( !IS_PRIV(v->domain) || !is_pinned_vcpu(v) ) break; if ( (rdmsr_safe(MSR_AMD64_NB_CFG, val) != 0) || (eax != (uint32_t)val) || ((edx ^ (val >> 32)) & ~(1 << (AMD64_NB_CFG_CF8_EXT_ENABLE_BIT - 32))) ) goto invalid; if ( wrmsr_safe(MSR_AMD64_NB_CFG, msr_content) != 0 ) goto fail; break; case MSR_FAM10H_MMIO_CONF_BASE: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) goto fail; if ( !IS_PRIV(v->domain) || !is_pinned_vcpu(v) ) break; if ( (rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) != 0) ) goto fail; if ( #ifdef CONFIG_X86_64 (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ? 
val != msr_content : #endif ((val ^ msr_content) & ~( FAM10H_MMIO_CONF_ENABLE | (FAM10H_MMIO_CONF_BUSRANGE_MASK << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | ((u64)FAM10H_MMIO_CONF_BASE_MASK << FAM10H_MMIO_CONF_BASE_SHIFT))) ) goto invalid; if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, msr_content) != 0 ) goto fail; break; case MSR_IA32_UCODE_REV: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !IS_PRIV(v->domain) || !is_pinned_vcpu(v) ) break; if ( rdmsr_safe(regs->ecx, val) ) goto fail; if ( msr_content ) goto invalid; break; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(regs->ecx, val) ) goto fail; val = guest_misc_enable(val); if ( msr_content != val ) goto invalid; break; case MSR_IA32_MPERF: case MSR_IA32_APERF: if (( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) && ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content ) != 0 ) goto fail; break; case MSR_IA32_PERF_CTL: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !is_cpufreq_controller(v->domain) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; case MSR_IA32_THERM_CONTROL: case MSR_IA32_ENERGY_PERF_BIAS: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) goto fail; if ( !IS_PRIV(v->domain) || !is_pinned_vcpu(v) ) break; if ( wrmsr_safe(regs->ecx, msr_content) != 0 ) goto fail; break; default: if ( wrmsr_hypervisor_regs(regs->ecx, msr_content) ) break; rc = vmce_wrmsr(regs->ecx, msr_content); if ( rc < 0 ) goto fail; if ( rc ) break; if ( (rdmsr_safe(regs->ecx, val) != 0) || (msr_content != val) ) invalid: gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %p from " "0x%016"PRIx64" to 0x%016"PRIx64".\n", _p(regs->ecx), val, msr_content); break; } break; } case 0x31: /* RDTSC */ if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_TSD) && !guest_kernel_mode(v, regs) ) goto fail; if ( v->domain->arch.vtsc ) pv_soft_rdtsc(v, regs, 0); else rdtsc(regs->eax, regs->edx); break; case 0x32: /* RDMSR */ switch ( (u32)regs->ecx ) { #ifdef CONFIG_X86_64 case MSR_FS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; regs->eax = v->arch.pv_vcpu.fs_base & 0xFFFFFFFFUL; regs->edx = v->arch.pv_vcpu.fs_base >> 32; break; case MSR_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; regs->eax = v->arch.pv_vcpu.gs_base_kernel & 0xFFFFFFFFUL; regs->edx = v->arch.pv_vcpu.gs_base_kernel >> 32; break; case MSR_SHADOW_GS_BASE: if ( is_pv_32on64_vcpu(v) ) goto fail; regs->eax = v->arch.pv_vcpu.gs_base_user & 0xFFFFFFFFUL; regs->edx = v->arch.pv_vcpu.gs_base_user >> 32; break; #endif case MSR_K7_FID_VID_CTL: case MSR_K7_FID_VID_STATUS: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) goto fail; if ( !is_cpufreq_controller(v->domain) ) { regs->eax = regs->edx = 0; break; } goto rdmsr_normal; case MSR_IA32_UCODE_REV: BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) { if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) goto fail; sync_core(); } goto rdmsr_normal; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(regs->ecx, msr_content) ) goto fail; msr_content = guest_misc_enable(msr_content); regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); break; default: if ( rdmsr_hypervisor_regs(regs->ecx, &val) ) { rdmsr_writeback: 
regs->eax = (uint32_t)val; regs->edx = (uint32_t)(val >> 32); break; } rc = vmce_rdmsr(regs->ecx, &val); if ( rc < 0 ) goto fail; if ( rc ) goto rdmsr_writeback; case MSR_EFER: rdmsr_normal: /* Everyone can read the MSR space. */ /* gdprintk(XENLOG_WARNING,"Domain attempted RDMSR %p.\n", _p(regs->ecx));*/ if ( rdmsr_safe(regs->ecx, msr_content) ) goto fail; regs->eax = (uint32_t)msr_content; regs->edx = (uint32_t)(msr_content >> 32); break; } break; default: goto fail; } #undef wr_ad #undef rd_ad done: instruction_done(regs, eip, bpmatch); skip: return EXCRET_fault_fixed; fail: return 0; } static inline int check_stack_limit(unsigned int ar, unsigned int limit, unsigned int esp, unsigned int decr) { return (((esp - decr) < (esp - 1)) && (!(ar & _SEGMENT_EC) ? (esp - 1) <= limit : (esp - decr) > limit)); } static void emulate_gate_op(struct cpu_user_regs *regs) { #ifdef __x86_64__ struct vcpu *v = current; unsigned int sel, ar, dpl, nparm, opnd_sel; unsigned int op_default, op_bytes, ad_default, ad_bytes; unsigned long off, eip, opnd_off, base, limit; int jump; /* Check whether this fault is due to the use of a call gate. */ if ( !read_gate_descriptor(regs->error_code, v, &sel, &off, &ar) || (((ar >> 13) & 3) < (regs->cs & 3)) || ((ar & _SEGMENT_TYPE) != 0xc00) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !(ar & _SEGMENT_P) ) { do_guest_trap(TRAP_no_segment, regs, 1); return; } dpl = (ar >> 13) & 3; nparm = ar & 0x1f; /* * Decode instruction (and perhaps operand) to determine RPL, * whether this is a jump or a call, and the call return offset. */ if ( !read_descriptor(regs->cs, v, regs, &base, &limit, &ar, 0) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || !(ar & _SEGMENT_CODE) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } op_bytes = op_default = ar & _SEGMENT_DB ? 4 : 2; ad_default = ad_bytes = op_default; opnd_sel = opnd_off = 0; jump = -1; for ( eip = regs->eip; eip - regs->_eip < 10; ) { switch ( insn_fetch(u8, base, eip, limit) ) { case 0x66: /* operand-size override */ op_bytes = op_default ^ 6; /* switch between 2/4 bytes */ continue; case 0x67: /* address-size override */ ad_bytes = ad_default != 4 ? 
4 : 2; /* switch to 2/4 bytes */ continue; case 0x2e: /* CS override */ opnd_sel = regs->cs; ASSERT(opnd_sel); continue; case 0x3e: /* DS override */ opnd_sel = read_sreg(regs, ds); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x26: /* ES override */ opnd_sel = read_sreg(regs, es); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x64: /* FS override */ opnd_sel = read_sreg(regs, fs); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x65: /* GS override */ opnd_sel = read_sreg(regs, gs); if ( !opnd_sel ) opnd_sel = dpl; continue; case 0x36: /* SS override */ opnd_sel = regs->ss; if ( !opnd_sel ) opnd_sel = dpl; continue; case 0xea: ++jump; /* FALLTHROUGH */ case 0x9a: ++jump; opnd_sel = regs->cs; opnd_off = eip; ad_bytes = ad_default; eip += op_bytes + 2; break; case 0xff: { unsigned int modrm; switch ( (modrm = insn_fetch(u8, base, eip, limit)) & 0xf8 ) { case 0x28: case 0x68: case 0xa8: ++jump; /* FALLTHROUGH */ case 0x18: case 0x58: case 0x98: ++jump; if ( ad_bytes != 2 ) { if ( (modrm & 7) == 4 ) { unsigned int sib; sib = insn_fetch(u8, base, eip, limit); modrm = (modrm & ~7) | (sib & 7); if ( (sib >>= 3) != 4 ) opnd_off = *(unsigned long *) decode_register(sib & 7, regs, 0); opnd_off <<= sib >> 3; } if ( (modrm & 7) != 5 || (modrm & 0xc0) ) opnd_off += *(unsigned long *) decode_register(modrm & 7, regs, 0); else modrm |= 0x87; if ( !opnd_sel ) { switch ( modrm & 7 ) { default: opnd_sel = read_sreg(regs, ds); break; case 4: case 5: opnd_sel = regs->ss; break; } } } else { switch ( modrm & 7 ) { case 0: case 1: case 7: opnd_off = regs->ebx; break; case 6: if ( !(modrm & 0xc0) ) modrm |= 0x80; else case 2: case 3: { opnd_off = regs->ebp; if ( !opnd_sel ) opnd_sel = regs->ss; } break; } if ( !opnd_sel ) opnd_sel = read_sreg(regs, ds); switch ( modrm & 7 ) { case 0: case 2: case 4: opnd_off += regs->esi; break; case 1: case 3: case 5: opnd_off += regs->edi; break; } } switch ( modrm & 0xc0 ) { case 0x40: opnd_off += insn_fetch(s8, base, eip, limit); break; case 0x80: opnd_off += insn_fetch(s32, base, eip, limit); break; } if ( ad_bytes == 4 ) opnd_off = (unsigned int)opnd_off; else if ( ad_bytes == 2 ) opnd_off = (unsigned short)opnd_off; break; } } break; } break; } if ( jump < 0 ) { fail: do_guest_trap(TRAP_gp_fault, regs, 1); skip: return; } if ( (opnd_sel != regs->cs && !read_descriptor(opnd_sel, v, regs, &base, &limit, &ar, 0)) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || ((ar & _SEGMENT_CODE) && !(ar & _SEGMENT_WR)) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } opnd_off += op_bytes; #define ad_default ad_bytes opnd_sel = insn_fetch(u16, base, opnd_off, limit); #undef ad_default ASSERT((opnd_sel & ~3) == regs->error_code); if ( dpl < (opnd_sel & 3) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !read_descriptor(sel, v, regs, &base, &limit, &ar, 0) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_CODE) || (!jump || (ar & _SEGMENT_EC) ? 
((ar >> 13) & 3) > (regs->cs & 3) : ((ar >> 13) & 3) != (regs->cs & 3)) ) { regs->error_code = sel; do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !(ar & _SEGMENT_P) ) { regs->error_code = sel; do_guest_trap(TRAP_no_segment, regs, 1); return; } if ( off > limit ) { regs->error_code = 0; do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !jump ) { unsigned int ss, esp, *stkp; int rc; #define push(item) do \ { \ --stkp; \ esp -= 4; \ rc = __put_user(item, stkp); \ if ( rc ) \ { \ propagate_page_fault((unsigned long)(stkp + 1) - rc, \ PFEC_write_access); \ return; \ } \ } while ( 0 ) if ( ((ar >> 13) & 3) < (regs->cs & 3) ) { sel |= (ar >> 13) & 3; /* Inner stack known only for kernel ring. */ if ( (sel & 3) != GUEST_KERNEL_RPL(v->domain) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } esp = v->arch.pv_vcpu.kernel_sp; ss = v->arch.pv_vcpu.kernel_ss; if ( (ss & 3) != (sel & 3) || !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (sel & 3) || !(ar & _SEGMENT_S) || (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR) ) { regs->error_code = ss & ~3; do_guest_trap(TRAP_invalid_tss, regs, 1); return; } if ( !(ar & _SEGMENT_P) || !check_stack_limit(ar, limit, esp, (4 + nparm) * 4) ) { regs->error_code = ss & ~3; do_guest_trap(TRAP_stack_error, regs, 1); return; } stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp); if ( !compat_access_ok(stkp - 4 - nparm, (4 + nparm) * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } push(regs->ss); push(regs->esp); if ( nparm ) { const unsigned int *ustkp; if ( !read_descriptor(regs->ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (regs->cs & 3) || !(ar & _SEGMENT_S) || (ar & _SEGMENT_CODE) || !(ar & _SEGMENT_WR) || !check_stack_limit(ar, limit, esp + nparm * 4, nparm * 4) ) return do_guest_trap(TRAP_gp_fault, regs, 1); ustkp = (unsigned int *)(unsigned long)((unsigned int)base + regs->_esp + nparm * 4); if ( !compat_access_ok(ustkp - nparm, nparm * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } do { unsigned int parm; --ustkp; rc = __get_user(parm, ustkp); if ( rc ) { propagate_page_fault((unsigned long)(ustkp + 1) - rc, 0); return; } push(parm); } while ( --nparm ); } } else { sel |= (regs->cs & 3); esp = regs->esp; ss = regs->ss; if ( !read_descriptor(ss, v, regs, &base, &limit, &ar, 0) || ((ar >> 13) & 3) != (sel & 3) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } if ( !check_stack_limit(ar, limit, esp, 2 * 4) ) { regs->error_code = 0; do_guest_trap(TRAP_stack_error, regs, 1); return; } stkp = (unsigned int *)(unsigned long)((unsigned int)base + esp); if ( !compat_access_ok(stkp - 2, 2 * 4) ) { do_guest_trap(TRAP_gp_fault, regs, 1); return; } } push(regs->cs); push(eip); #undef push regs->esp = esp; regs->ss = ss; } else sel |= (regs->cs & 3); regs->cs = sel; instruction_done(regs, off, 0); #endif } asmlinkage void do_general_protection(struct cpu_user_regs *regs) { struct vcpu *v = current; unsigned long fixup; DEBUGGER_trap_entry(TRAP_gp_fault, regs); if ( regs->error_code & 1 ) goto hardware_gp; if ( !guest_mode(regs) ) goto gp_in_kernel; /* * Cunning trick to allow arbitrary "INT n" handling. * * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n> * instruction from trapping to the appropriate vector, when that might not * be expected by Xen or the guest OS. 
For example, that entry might be for * a fault handler (unlike traps, faults don't increment EIP), or might * expect an error code on the stack (which a software trap never * provides), or might be a hardware interrupt handler that doesn't like * being called spuriously. * * Instead, a GPF occurs with the faulting IDT vector in the error code. * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is * clear to indicate that it's a software fault, not hardware. * * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is * okay because they can only be triggered by an explicit DPL-checked * instruction. The DPL specified by the guest OS for these vectors is NOT * CHECKED!! */ if ( (regs->error_code & 3) == 2 ) { /* This fault must be due to <INT n> instruction. */ const struct trap_info *ti; unsigned char vector = regs->error_code >> 3; ti = &v->arch.pv_vcpu.trap_ctxt[vector]; if ( permit_softint(TI_GET_DPL(ti), v, regs) ) { regs->eip += 2; do_guest_trap(vector, regs, 0); return; } } else if ( is_pv_32on64_vcpu(v) && regs->error_code ) { emulate_gate_op(regs); return; } /* Emulate some simple privileged and I/O instructions. */ if ( (regs->error_code == 0) && emulate_privileged_op(regs) ) { trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->eip); return; } #if defined(__i386__) if ( VM_ASSIST(v->domain, VMASST_TYPE_4gb_segments) && (regs->error_code == 0) && gpf_emulate_4gb(regs) ) { TRACE_1D(TRC_PV_EMULATE_4GB, regs->eip); return; } #endif /* Pass on GPF as is. */ do_guest_trap(TRAP_gp_fault, regs, 1); return; gp_in_kernel: if ( likely((fixup = search_exception_table(regs->eip)) != 0) ) { dprintk(XENLOG_INFO, "GPF (%04x): %p -> %p\n", regs->error_code, _p(regs->eip), _p(fixup)); regs->eip = fixup; return; } DEBUGGER_trap_fatal(TRAP_gp_fault, regs); hardware_gp: show_execution_state(regs); panic("GENERAL PROTECTION FAULT\n[error_code=%04x]\n", regs->error_code); } static DEFINE_PER_CPU(struct softirq_trap, softirq_trap); static void nmi_mce_softirq(void) { int cpu = smp_processor_id(); struct softirq_trap *st = &per_cpu(softirq_trap, cpu); cpumask_t affinity; BUG_ON(st == NULL); BUG_ON(st->vcpu == NULL); /* Set the tmp value unconditionally, so that * the check in the iret hypercall works. */ cpumask_copy(st->vcpu->cpu_affinity_tmp, st->vcpu->cpu_affinity); if ((cpu != st->processor) || (st->processor != st->vcpu->processor)) { /* We are on a different physical cpu. * Make sure to wakeup the vcpu on the * specified processor. */ cpus_clear(affinity); cpu_set(st->processor, affinity); vcpu_set_affinity(st->vcpu, &affinity); /* Affinity is restored in the iret hypercall. */ } /* Only used to defer wakeup of domain/vcpu to * a safe (non-NMI/MCE) context. */ vcpu_kick(st->vcpu); st->vcpu = NULL; } void async_exception_cleanup(struct vcpu *curr) { int trap; if ( !curr->async_exception_mask ) return; /* Restore affinity. */ if ( !cpumask_empty(curr->cpu_affinity_tmp) && !cpumask_equal(curr->cpu_affinity_tmp, curr->cpu_affinity) ) { vcpu_set_affinity(curr, curr->cpu_affinity_tmp); cpumask_clear(curr->cpu_affinity_tmp); } if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) ) trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE); else for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap ) if ( (curr->async_exception_mask ^ curr->async_exception_state(trap).old_mask) == (1 << trap) ) break; ASSERT(trap <= VCPU_TRAP_LAST); /* inject vMCE to PV_Guest including DOM0. 
*/ if ( trap == VCPU_TRAP_MCE ) { gdprintk(XENLOG_DEBUG, "MCE: Return from vMCE# trap!\n"); if ( curr->vcpu_id == 0 ) { struct domain *d = curr->domain; if ( !d->arch.vmca_msrs->nr_injection ) { printk(XENLOG_WARNING "MCE: ret from vMCE#, " "no injection node\n"); goto end; } d->arch.vmca_msrs->nr_injection--; if ( !list_empty(&d->arch.vmca_msrs->impact_header) ) { struct bank_entry *entry; entry = list_entry(d->arch.vmca_msrs->impact_header.next, struct bank_entry, list); gdprintk(XENLOG_DEBUG, "MCE: delete last injection node\n"); list_del(&entry->list); } else printk(XENLOG_ERR "MCE: didn't found last injection node\n"); /* further injection */ if ( d->arch.vmca_msrs->nr_injection > 0 && guest_has_trap_callback(d, 0, TRAP_machine_check) && !test_and_set_bool(curr->mce_pending) ) { int cpu = smp_processor_id(); cpumask_t affinity; cpumask_copy(curr->cpu_affinity_tmp, curr->cpu_affinity); cpus_clear(affinity); cpu_set(cpu, affinity); printk(XENLOG_DEBUG "MCE: CPU%d set affinity, old %d\n", cpu, curr->processor); vcpu_set_affinity(curr, &affinity); } } } end: /* Restore previous asynchronous exception mask. */ curr->async_exception_mask = curr->async_exception_state(trap).old_mask; } static void nmi_dom0_report(unsigned int reason_idx) { struct domain *d = dom0; if ( (d == NULL) || (d->vcpu == NULL) || (d->vcpu[0] == NULL) ) return; set_bit(reason_idx, nmi_reason(d)); send_guest_trap(d, 0, TRAP_nmi); } static void pci_serr_error(struct cpu_user_regs *regs) { console_force_unlock(); printk("\n\nNMI - PCI system error (SERR)\n"); outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */ } static void io_check_error(struct cpu_user_regs *regs) { switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ nmi_dom0_report(_XEN_NMIREASON_io_error); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ console_force_unlock(); printk("\n\nNMI - I/O ERROR\n"); fatal_trap(TRAP_nmi, regs); } outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */ mdelay(1); outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */ } static void unknown_nmi_error(unsigned char reason) { switch ( opt_nmi[0] ) { case 'd': /* 'dom0' */ nmi_dom0_report(_XEN_NMIREASON_unknown); case 'i': /* 'ignore' */ break; default: /* 'fatal' */ printk("Uhhuh. NMI received for unknown reason %02x.\n", reason); printk("Dazed and confused, but trying to continue\n"); printk("Do you have a strange power saving mode enabled?\n"); kexec_crash(); } } static int dummy_nmi_callback(struct cpu_user_regs *regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; asmlinkage void do_nmi(struct cpu_user_regs *regs) { unsigned int cpu = smp_processor_id(); unsigned char reason; ++nmi_count(cpu); if ( nmi_callback(regs, cpu) ) return; if ( nmi_watchdog ) nmi_watchdog_tick(regs); /* Only the BSP gets external NMIs from the system. 
*/ if ( cpu == 0 ) { reason = inb(0x61); if ( reason & 0x80 ) pci_serr_error(regs); else if ( reason & 0x40 ) io_check_error(regs); else if ( !nmi_watchdog ) unknown_nmi_error((unsigned char)(reason&0xff)); } } void set_nmi_callback(nmi_callback_t callback) { nmi_callback = callback; } void unset_nmi_callback(void) { nmi_callback = dummy_nmi_callback; } asmlinkage void do_device_not_available(struct cpu_user_regs *regs) { struct vcpu *curr = current; BUG_ON(!guest_mode(regs)); vcpu_restore_fpu_lazy(curr); if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS ) { do_guest_trap(TRAP_no_device, regs, 0); curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS; } else TRACE_0D(TRC_PV_MATH_STATE_RESTORE); return; } u64 read_efer(void) { return this_cpu(efer); } void write_efer(u64 val) { this_cpu(efer) = val; wrmsrl(MSR_EFER, val); } static void ler_enable(void) { u64 debugctl; if ( !this_cpu(ler_msr) ) return; rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | 1); } asmlinkage void do_debug(struct cpu_user_regs *regs) { struct vcpu *v = current; DEBUGGER_trap_entry(TRAP_debug, regs); if ( !guest_mode(regs) ) { if ( regs->eflags & X86_EFLAGS_TF ) { #ifdef __x86_64__ /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */ if ( (regs->rip >= (unsigned long)sysenter_entry) && (regs->rip <= (unsigned long)sysenter_eflags_saved) ) { if ( regs->rip == (unsigned long)sysenter_eflags_saved ) regs->eflags &= ~X86_EFLAGS_TF; goto out; } #endif if ( !debugger_trap_fatal(TRAP_debug, regs) ) { WARN_ON(1); regs->eflags &= ~X86_EFLAGS_TF; } } else { /* * We ignore watchpoints when they trigger within Xen. This may * happen when a buffer is passed to us which previously had a * watchpoint set on it. No need to bump EIP; the only faulting * trap is an instruction breakpoint, which can't happen to us. */ WARN_ON(!search_exception_table(regs->eip)); } goto out; } /* Save debug status register where guest OS can peek at it */ v->arch.debugreg[6] = read_debugreg(6); ler_enable(); do_guest_trap(TRAP_debug, regs, 0); return; out: ler_enable(); return; } asmlinkage void do_spurious_interrupt_bug(struct cpu_user_regs *regs) { } static void __set_intr_gate(unsigned int n, uint32_t dpl, void *addr) { int i; /* Keep secondary tables in sync with IRQ updates. */ for ( i = 1; i < NR_CPUS; i++ ) if ( idt_tables[i] != NULL ) _set_gate(&idt_tables[i][n], 14, dpl, addr); _set_gate(&idt_table[n], 14, dpl, addr); } static void set_swint_gate(unsigned int n, void *addr) { __set_intr_gate(n, 3, addr); } void set_intr_gate(unsigned int n, void *addr) { __set_intr_gate(n, 0, addr); } void load_TR(void) { struct tss_struct *tss = &this_cpu(init_tss); struct desc_ptr old_gdt, tss_gdt = { .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY), .limit = LAST_RESERVED_GDT_BYTE }; _set_tssldt_desc( this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)tss, offsetof(struct tss_struct, __cacheline_filler) - 1, 9); #ifdef CONFIG_COMPAT _set_tssldt_desc( this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY, (unsigned long)tss, offsetof(struct tss_struct, __cacheline_filler) - 1, 11); #endif /* Switch to non-compat GDT (which has B bit clear) to execute LTR. 
    asm volatile (
        "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
        : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
}

void __devinit percpu_traps_init(void)
{
    subarch_percpu_traps_init();

    if ( !opt_ler )
        return;

    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
            break;
        case 15:
            this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
            break;
        }
        break;
    case X86_VENDOR_AMD:
        switch ( boot_cpu_data.x86 )
        {
        case 6:
        case 0xf ... 0x17:
            this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
            break;
        }
        break;
    }

    ler_enable();
}

void __init trap_init(void)
{
    /*
     * Note that interrupt gates are always used, rather than trap gates. We
     * must have interrupts disabled until DS/ES/FS/GS are saved because the
     * first activation must have the "bad" value(s) for these registers and
     * we may lose them if another activation is installed before they are
     * saved. The page-fault handler also needs interrupts disabled until %cr2
     * has been read and saved on the stack.
     */
    set_intr_gate(TRAP_divide_error,&divide_error);
    set_intr_gate(TRAP_debug,&debug);
    set_intr_gate(TRAP_nmi,&nmi);
    set_swint_gate(TRAP_int3,&int3);         /* usable from all privileges */
    set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
    set_intr_gate(TRAP_bounds,&bounds);
    set_intr_gate(TRAP_invalid_op,&invalid_op);
    set_intr_gate(TRAP_no_device,&device_not_available);
    set_intr_gate(TRAP_copro_seg,&coprocessor_segment_overrun);
    set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
    set_intr_gate(TRAP_no_segment,&segment_not_present);
    set_intr_gate(TRAP_stack_error,&stack_segment);
    set_intr_gate(TRAP_gp_fault,&general_protection);
    set_intr_gate(TRAP_page_fault,&page_fault);
    set_intr_gate(TRAP_spurious_int,&spurious_interrupt_bug);
    set_intr_gate(TRAP_copro_error,&coprocessor_error);
    set_intr_gate(TRAP_alignment_check,&alignment_check);
    set_intr_gate(TRAP_machine_check,&machine_check);
    set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);

    /* CPU0 uses the master IDT. */
    idt_tables[0] = idt_table;

    this_cpu(gdt_table) = boot_cpu_gdt_table;
#ifdef CONFIG_COMPAT
    this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table;
#endif

    percpu_traps_init();

    cpu_init();

    open_softirq(NMI_MCE_SOFTIRQ, nmi_mce_softirq);
}

long register_guest_nmi_callback(unsigned long address)
{
    struct vcpu *v = current;
    struct domain *d = v->domain;
    struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];

    t->vector  = TRAP_nmi;
    t->flags   = 0;
    t->cs      = (is_pv_32on64_domain(d) ?
                  FLAT_COMPAT_KERNEL_CS : FLAT_KERNEL_CS);
    t->address = address;
    TI_SET_IF(t, 1);

    /*
     * If no handler was registered we can 'lose the NMI edge'. Re-assert it
     * now.
     */
    if ( (v->vcpu_id == 0) && (arch_get_nmi_reason(d) != 0) )
        v->nmi_pending = 1;

    return 0;
}

long unregister_guest_nmi_callback(void)
{
    struct vcpu *v = current;
    struct trap_info *t = &v->arch.pv_vcpu.trap_ctxt[TRAP_nmi];

    memset(t, 0, sizeof(*t));

    return 0;
}

int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
                            unsigned int trap_nr)
{
    struct vcpu *v;
    struct trap_info *t;

    BUG_ON(d == NULL);
    BUG_ON(vcpuid >= d->max_vcpus);

    /* Sanity check - XXX should be more fine grained. */
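    /*
     * The check below guards the trap_ctxt[] lookup that follows: callers
     * are expected to pass one of the hypervisor-generated vectors (NMI,
     * machine check, or another architectural exception), never an
     * arbitrary guest-chosen number.
     */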
    BUG_ON(trap_nr > TRAP_syscall);

    v = d->vcpu[vcpuid];
    t = &v->arch.pv_vcpu.trap_ctxt[trap_nr];

    return (t->address != 0);
}

int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr)
{
    struct vcpu *v;
    struct softirq_trap *st = &per_cpu(softirq_trap, smp_processor_id());

    BUG_ON(d == NULL);
    BUG_ON(vcpuid >= d->max_vcpus);
    v = d->vcpu[vcpuid];

    switch (trap_nr)
    {
    case TRAP_nmi:
        if ( cmpxchgptr(&st->vcpu, NULL, v) )
            return -EBUSY;
        if ( !test_and_set_bool(v->nmi_pending) )
        {
            st->domain = d;
            st->processor = v->processor;

            /* not safe to wake up a vcpu here */
            raise_softirq(NMI_MCE_SOFTIRQ);
            return 0;
        }
        st->vcpu = NULL;
        break;

    case TRAP_machine_check:
        if ( cmpxchgptr(&st->vcpu, NULL, v) )
            return -EBUSY;

        /* We are called by the machine check (exception or polling) handlers
         * on the physical CPU that reported a machine check error. */
        if ( !test_and_set_bool(v->mce_pending) )
        {
            st->domain = d;
            st->vcpu = v;
            st->processor = v->processor;

            /* not safe to wake up a vcpu here */
            raise_softirq(NMI_MCE_SOFTIRQ);
            return 0;
        }
        st->vcpu = NULL;
        break;
    }

    /* delivery failed */
    return -EIO;
}

long do_set_trap_table(XEN_GUEST_HANDLE(const_trap_info_t) traps)
{
    struct trap_info cur;
    struct vcpu *curr = current;
    struct trap_info *dst = curr->arch.pv_vcpu.trap_ctxt;
    long rc = 0;

    /* If no table is presented then clear the entire virtual IDT. */
    if ( guest_handle_is_null(traps) )
    {
        memset(dst, 0, 256 * sizeof(*dst));
        init_int80_direct_trap(curr);
        return 0;
    }

    for ( ; ; )
    {
        if ( hypercall_preempt_check() )
        {
            rc = hypercall_create_continuation(
                __HYPERVISOR_set_trap_table, "h", traps);
            break;
        }

        if ( copy_from_guest(&cur, traps, 1) )
        {
            rc = -EFAULT;
            break;
        }

        if ( cur.address == 0 )
            break;

        fixup_guest_code_selector(curr->domain, cur.cs);

        memcpy(&dst[cur.vector], &cur, sizeof(cur));

        if ( cur.vector == 0x80 )
            init_int80_direct_trap(curr);

        guest_handle_add_offset(traps, 1);
    }

    return rc;
}

long set_debugreg(struct vcpu *v, int reg, unsigned long value)
{
    int i;
    struct vcpu *curr = current;

    switch ( reg )
    {
    case 0:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( v == curr )
            write_debugreg(0, value);
        break;
    case 1:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( v == curr )
            write_debugreg(1, value);
        break;
    case 2:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( v == curr )
            write_debugreg(2, value);
        break;
    case 3:
        if ( !access_ok(value, sizeof(long)) )
            return -EPERM;
        if ( v == curr )
            write_debugreg(3, value);
        break;
    case 6:
        /*
         * DR6: Bits 4-11,16-31 reserved (set to 1).
         *      Bit 12 reserved (set to 0).
         */
        value &= 0xffffefff; /* reserved bits => 0 */
        value |= 0xffff0ff0; /* reserved bits => 1 */
        if ( v == curr )
            write_debugreg(6, value);
        break;
    case 7:
        /*
         * DR7: Bit 10 reserved (set to 1).
         *      Bits 11-12,14-15 reserved (set to 0).
         */
        value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
        value |=  DR_CONTROL_RESERVED_ONE;  /* reserved bits => 1 */
        /*
         * Privileged bits:
         *      GD (bit 13): must be 0.
         */
        if ( value & DR_GENERAL_DETECT )
            return -EPERM;
        /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
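        /*
         * DR7 layout: bits 0-7 are the per-breakpoint local/global enable
         * pairs (L0/G0 .. L3/G3); bits 16-31 hold a 4-bit control field per
         * breakpoint whose low two bits select the break condition.  A
         * condition of DR_IO (break on I/O access) is only architecturally
         * valid with CR4.DE set, which is what the loop below verifies
         * before stashing the I/O-breakpoint enables in virtual DR5.
         */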
        if ( value & DR7_ACTIVE_MASK )
        {
            unsigned int io_enable = 0;

            for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
            {
                if ( ((value >> i) & 3) == DR_IO )
                {
                    if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
                        return -EPERM;
                    io_enable |= value & (3 << ((i - 16) >> 1));
                }
#ifdef __i386__
                if ( ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) ||
                      !boot_cpu_has(X86_FEATURE_LM)) &&
                     (((value >> i) & 0xc) == DR_LEN_8) )
                    return -EPERM;
#endif
            }

            /* Guest DR5 is a handy stash for I/O intercept information. */
            v->arch.debugreg[5] = io_enable;
            value &= ~io_enable;

            /*
             * If DR7 was previously clear then we need to load all other
             * debug registers at this point as they were not restored during
             * context switch.
             */
            if ( (v == curr) &&
                 !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
            {
                write_debugreg(0, v->arch.debugreg[0]);
                write_debugreg(1, v->arch.debugreg[1]);
                write_debugreg(2, v->arch.debugreg[2]);
                write_debugreg(3, v->arch.debugreg[3]);
                write_debugreg(6, v->arch.debugreg[6]);
            }
        }
        if ( v == curr )
            write_debugreg(7, value);
        break;
    default:
        return -EINVAL;
    }

    v->arch.debugreg[reg] = value;
    return 0;
}

long do_set_debugreg(int reg, unsigned long value)
{
    return set_debugreg(current, reg, value);
}

unsigned long do_get_debugreg(int reg)
{
    struct vcpu *curr = current;

    switch ( reg )
    {
    case 0 ... 3:
    case 6:
        return curr->arch.debugreg[reg];
    case 7:
        return (curr->arch.debugreg[7] |
                curr->arch.debugreg[5]);
    case 4 ... 5:
        return ((curr->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ?
                curr->arch.debugreg[reg + 2] : 0);
    }

    return -EINVAL;
}

/*
 * Local variables:
 * mode: C
 * c-set-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */