Strip more BOMs

The 01a sample text file I created in VS2022 has a big-endian UTF-16 byte order mark at the front. When read with the File.Read* methods it was fine, but when piped in from the command line the extra bytes at the front caused the integer parsing to fail. This is the cleanest way I can find to strip any of a set of BOMs from a string. Maybe there's an easier way somewhere, but the main issue is that something like line[0..1].SequenceEqual() may or may not be comparing wide chars (0xFEFF as opposed to 0xFE, 0xFF, for example), so I can't just build a list of preamble byte arrays and check whether the byte version of the string starts with one of them.

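The "StripPreamble" character list is still a mystery. I found it while working on my MacBook for aoc2021 and can't seem to find any search results indicating what these bytes represent, so... just gonna leave it there. (Note: code points 8745, 9559 and 9488 are the characters ∩, ╗ and ┐, which is what the UTF-8 BOM bytes 0xEF 0xBB 0xBF turn into when decoded as code page 437, so this list most likely matches a UTF-8 BOM that was read through an OEM code page.)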
This commit is contained in:
2022-12-01 10:22:47 -06:00
parent c1757500da
commit 483f1f2502

View File

@ -7,6 +7,7 @@ namespace aoc2022;
internal static class Util
{
private static readonly char[] StripPreamble = { (char)8745, (char)9559, (char)9488, };
private static readonly Encoding[] StripBOMsFromEncodings = { Encoding.UTF8, Encoding.Unicode, Encoding.BigEndianUnicode, };
private static void ReadData(string inputName, Action<string> processor)
{
if (Console.IsInputRedirected)
@ -16,15 +17,20 @@ internal static class Util
{
if (i == 0)
{
var preamble = Encoding.UTF8.GetPreamble();
if (line[0..preamble.Length].SequenceEqual(preamble.Select(x => (char)x)))
{
line = line[preamble.Length..];
}
else if (line[0..StripPreamble.Length].ToCharArray().SequenceEqual(StripPreamble))
if (line[0..StripPreamble.Length].SequenceEqual(StripPreamble))
{
line = line[StripPreamble.Length..];
}
else
{
foreach (var encoding in StripBOMsFromEncodings)
{
if (line.StartsWith(encoding.GetString(encoding.GetPreamble()), StringComparison.Ordinal))
{
line = line[encoding.GetPreamble().Length..];
}
}
}
}
processor(line);
if (!string.IsNullOrEmpty(line))