diff options
| -rw-r--r-- | src/libraries.adb | 9 | ||||
| -rw-r--r-- | src/vhdl/scanner.adb | 49 | ||||
| -rw-r--r-- | src/vhdl/scanner.ads | 5 | 
3 files changed, 56 insertions, 7 deletions
diff --git a/src/libraries.adb b/src/libraries.adb
index 9852c5261..0cca4d0b0 100644
--- a/src/libraries.adb
+++ b/src/libraries.adb
@@ -1353,8 +1353,15 @@ package body Libraries is
       Res : Iir_Design_File;
    begin
       Scanner.Set_File (File);
-      Res := Parse.Parse_Design_File;
+      if Scanner.Detect_Encoding_Errors then
+         --  Don't even try to parse such a file.  The BOM will be interpreted
+         --  as an identifier, which is not valid at the beginning of a file.
+         Res := Null_Iir;
+      else
+         Res := Parse.Parse_Design_File;
+      end if;
       Scanner.Close_File;
+
       if Res /= Null_Iir then
          Set_Parent (Res, Work_Library);
          Set_Design_File_Filename (Res, Files_Map.Get_File_Name (File));
diff --git a/src/vhdl/scanner.adb b/src/vhdl/scanner.adb
index f18723d1f..26dff5e9f 100644
--- a/src/vhdl/scanner.adb
+++ b/src/vhdl/scanner.adb
@@ -268,12 +268,8 @@ package body Scanner is
    is
       N_Source: File_Buffer_Acc;
    begin
-      if Current_Context.Source /= null then
-         raise Internal_Error;
-      end if;
-      if Source_File = No_Source_File_Entry then
-         raise Internal_Error;
-      end if;
+      pragma Assert (Current_Context.Source = null);
+      pragma Assert (Source_File /= No_Source_File_Entry);
       N_Source := Get_File_Source (Source_File);
       Current_Context := (Source => N_Source,
                           Source_File => Source_File,
@@ -293,6 +289,47 @@ package body Scanner is
       Current_Token := Tok_Invalid;
    end Set_File;
 
+   function Detect_Encoding_Errors return Boolean
+   is
+      C : constant Character := Source (Pos);
+   begin
+      --  No need to check further if first character is plain ASCII-7
+      if C >= ' ' and C < Character'Val (127) then
+         return False;
+      end if;
+
+      --  UTF-8 BOM is EF BB BF
+      if Source (Pos + 0) = Character'Val (16#ef#)
+        and then Source (Pos + 1) = Character'Val (16#bb#)
+        and then Source (Pos + 2) = Character'Val (16#bf#)
+      then
+         Error_Msg_Scan
+           ("source encoding must be latin-1 (UTF-8 BOM detected)");
+         return True;
+      end if;
+
+      --  UTF-16 BE BOM is FE FF
+      if Source (Pos + 0) = Character'Val (16#fe#)
+        and then Source (Pos + 1) = Character'Val (16#ff#)
+      then
+         Error_Msg_Scan
+           ("source encoding must be latin-1 (UTF-16 BE BOM detected)");
+         return True;
+      end if;
+
+      --  UTF-16 LE BOM is FF FE
+      if Source (Pos + 0) = Character'Val (16#ff#)
+        and then Source (Pos + 1) = Character'Val (16#fe#)
+      then
+         Error_Msg_Scan
+           ("source encoding must be latin-1 (UTF-16 LE BOM detected)");
+         return True;
+      end if;
+
+      --  Certainly weird, but scanner/parser will catch it.
+      return False;
+   end Detect_Encoding_Errors;
+
    procedure Set_Current_Position (Position: Source_Ptr)
    is
       Loc : Location_Type;
diff --git a/src/vhdl/scanner.ads b/src/vhdl/scanner.ads
index 3edc9c0ba..6a5e1cf90 100644
--- a/src/vhdl/scanner.ads
+++ b/src/vhdl/scanner.ads
@@ -62,6 +62,11 @@ package Scanner is
    -- Initialize the scanner with file SOURCE_FILE.
    procedure Set_File (Source_File : Source_File_Entry);
 
+   --  This function can be called just after Set_File to detect UTF BOM
+   --  patterns.  It reports an error if a BOM is present and return True.
+   --  Silently return False if no error detected.
+   function Detect_Encoding_Errors return Boolean;
+
    procedure Set_Current_Position (Position: Source_Ptr);
 
    -- Finalize the scanner.
