19 using System.Collections.Generic;
23 using System.Threading.Tasks;
/// <summary>
/// A single HTML tag (name plus attributes) produced by a small, forgiving scanner.
/// The scanner only looks at markup between angle brackets; text between tags is skipped.
/// </summary>
internal class HtmlTag
{
    /// <summary>
    /// The attributes of the tag, keyed by attribute name. Attributes without a value
    /// (e.g. <c>disabled</c>) are stored with a <see langword="null"/> value.
    /// </summary>
    public Dictionary<string, string> Attributes { get; }

    /// <summary>
    /// The tag name exactly as written in the markup (closing tags keep their leading slash, e.g. <c>/p</c>).
    /// </summary>
    public string Tag { get; }

    private HtmlTag(string tag, Dictionary<string, string> attributes)
    {
        this.Tag = tag;
        this.Attributes = attributes;
    }

    /// <summary>
    /// Compares a tag name with an expected name, ignoring case (HTML tag names are case-insensitive).
    /// </summary>
    private static bool IsTag(HtmlTag tag, string name)
    {
        return string.Equals(tag.Tag, name, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Advances <paramref name="character"/> past any white space.
    /// </summary>
    private static void SkipWhiteSpace(StringReader reader, ref int character)
    {
        while (character >= 0 && char.IsWhiteSpace((char)character))
        {
            character = reader.Read();
        }
    }

    /// <summary>
    /// Reads the next tag from <paramref name="reader"/>, skipping any text that precedes it.
    /// </summary>
    /// <param name="reader">The reader positioned anywhere in the HTML source.</param>
    /// <returns>
    /// The tag that was read. If the input ends before a tag is found, the returned tag has an empty name.
    /// </returns>
    private static HtmlTag ParseTag(StringReader reader)
    {
        int character = reader.Read();

        // Discard everything up to (and including) the next opening angle bracket.
        while (character >= 0 && character != '<')
        {
            character = reader.Read();
        }

        if (character == '<')
        {
            character = reader.Read();
        }

        SkipWhiteSpace(reader, ref character);

        StringBuilder tagBuilder = new StringBuilder();

        while (character >= 0 && !char.IsWhiteSpace((char)character) && character != '>')
        {
            tagBuilder.Append((char)character);
            character = reader.Read();
        }

        string tag = tagBuilder.ToString();

        Dictionary<string, string> attributes = new Dictionary<string, string>();

        (string, string)? attribute = ReadAttribute(reader, ref character);

        while (attribute != null)
        {
            (string name, string value) = attribute.Value;

            // When the input is exhausted ReadAttribute reports an attribute with an empty name;
            // that is an artefact of reaching the end, not a real attribute. A named attribute that
            // happens to be the last thing in a truncated document is kept.
            if (character >= 0 || name.Length > 0)
            {
                attributes[name] = value;
            }

            if (character < 0)
            {
                break;
            }

            attribute = ReadAttribute(reader, ref character);
        }

        return new HtmlTag(tag, attributes);
    }

    /// <summary>
    /// Enumerates the tags nested in a paragraph, up to (but not including) its closing tag.
    /// Every nested tag inherits the paragraph's attributes, unless it already defines them itself.
    /// </summary>
    private static IEnumerable<HtmlTag> ParseParagraphContent(StringReader reader, HtmlTag paragraph)
    {
        foreach (HtmlTag nestedTag in ParseTagsUntil(reader, "/p"))
        {
            if (IsTag(nestedTag, "/p"))
            {
                continue;
            }

            foreach (KeyValuePair<string, string> kvp in paragraph.Attributes)
            {
                if (!nestedTag.Attributes.ContainsKey(kvp.Key))
                {
                    nestedTag.Attributes[kvp.Key] = kvp.Value;
                }
            }

            yield return nestedTag;
        }
    }

    /// <summary>
    /// Lazily enumerates tags from <paramref name="reader"/> until a tag named
    /// <paramref name="targetTag"/> is found or the input ends.
    /// </summary>
    /// <param name="reader">The reader from which the tags are read.</param>
    /// <param name="targetTag">The name of the tag at which to stop (compared ignoring case).</param>
    /// <returns>
    /// The tags that were read. The last element is always the tag at which the enumeration stopped
    /// (the target tag, or whichever tag was read last if the input ended first). Paragraph tags are
    /// not returned themselves: they are replaced by their nested tags, which inherit their attributes.
    /// </returns>
    public static IEnumerable<HtmlTag> ParseTagsUntil(StringReader reader, string targetTag)
    {
        HtmlTag tag = ParseTag(reader);

        while (!IsTag(tag, targetTag) && reader.Peek() >= 0)
        {
            if (IsTag(tag, "p"))
            {
                foreach (HtmlTag nestedTag in ParseParagraphContent(reader, tag))
                {
                    yield return nestedTag;
                }
            }
            else
            {
                yield return tag;
            }

            tag = ParseTag(reader);
        }

        yield return tag;
    }

    /// <summary>
    /// Lazily enumerates the tags contained in an HTML fragment.
    /// </summary>
    /// <param name="html">The HTML source.</param>
    /// <returns>
    /// The tags found in the source. Paragraph tags are not returned themselves: they are replaced by
    /// their nested tags, which inherit the attributes of the paragraph.
    /// </returns>
    public static IEnumerable<HtmlTag> Parse(string html)
    {
        using (StringReader reader = new StringReader(html))
        {
            while (reader.Peek() >= 0)
            {
                HtmlTag tag = ParseTag(reader);

                if (IsTag(tag, "p"))
                {
                    foreach (HtmlTag nestedTag in ParseParagraphContent(reader, tag))
                    {
                        yield return nestedTag;
                    }
                }
                else
                {
                    yield return tag;
                }
            }
        }
    }

    /// <summary>
    /// Reads one attribute (<c>name</c>, <c>name=value</c>, <c>name="value"</c> or <c>name='value'</c>).
    /// </summary>
    /// <param name="reader">The reader from which further characters are read.</param>
    /// <param name="character">
    /// On entry, the current (already read) character; on exit, the first character that has not been
    /// consumed yet, or a negative value at the end of the input.
    /// </param>
    /// <returns>
    /// The attribute name and value (the value is <see langword="null"/> if the attribute has none),
    /// or <see langword="null"/> if the current character closes the tag.
    /// </returns>
    private static (string, string)? ReadAttribute(StringReader reader, ref int character)
    {
        SkipWhiteSpace(reader, ref character);

        if (character == '>')
        {
            return null;
        }

        StringBuilder attributeNameBuilder = new StringBuilder();

        while (character >= 0 && !char.IsWhiteSpace((char)character) && character != '>' && character != '=')
        {
            attributeNameBuilder.Append((char)character);
            character = reader.Read();
        }

        string attributeName = attributeNameBuilder.ToString();

        SkipWhiteSpace(reader, ref character);

        if (character != '=')
        {
            // Attribute without a value.
            return (attributeName, null);
        }

        character = reader.Read();

        SkipWhiteSpace(reader, ref character);

        if (character == '>')
        {
            // "name=" immediately followed by the end of the tag.
            return (attributeName, null);
        }

        StringBuilder attributeValueBuilder = new StringBuilder();

        if (character == '"' || character == '\'')
        {
            int quoteChar = character;

            character = reader.Read();

            // HTML has no backslash escapes inside attribute values: the value simply runs up to the
            // matching quote.
            while (character >= 0 && character != quoteChar)
            {
                attributeValueBuilder.Append((char)character);
                character = reader.Read();
            }

            // Consume the closing quote; otherwise it would be mistaken for the start of the next
            // attribute name.
            if (character == quoteChar)
            {
                character = reader.Read();
            }
        }
        else
        {
            while (character >= 0 && !char.IsWhiteSpace((char)character) && character != '>' && character != '=')
            {
                attributeValueBuilder.Append((char)character);
                character = reader.Read();
            }
        }

        return (attributeName, attributeValueBuilder.ToString());
    }
}
/// <summary>
/// Resolves an image URI to a path on the local file system, downloading the image if necessary.
/// </summary>
/// <param name="uri">The image URI: a <c>data:</c> URI, a local path, or a remote address.</param>
/// <param name="baseUriString">The base directory or base address against which relative URIs are resolved.</param>
/// <returns>
/// The local path of the image, and a value indicating whether that path lies inside a temporary
/// directory created by this method (which the caller may want to delete when done).
/// If the URI cannot be resolved or the download fails, the path is <see langword="null"/>.
/// </returns>
public static (string path, bool wasDownloaded) ResolveImageURI(string uri, string baseUriString)
{
    // Ordinal comparison: this is a protocol prefix, not linguistic text.
    if (uri.StartsWith("data:", StringComparison.Ordinal))
    {
        // GetTempFileName reserves a unique name by creating a file; swap it for a directory.
        string tempDirectory = Path.GetTempFileName();
        if (File.Exists(tempDirectory))
        {
            File.Delete(tempDirectory);
        }

        Directory.CreateDirectory(tempDirectory);

        // NOTE(review): nothing visible here writes the decoded data to "temp.svg" before the path
        // is returned - confirm that the file is produced elsewhere.
        return (Path.Combine(tempDirectory, "temp.svg"), true);
    }

    string relativeToBase = Path.Combine(baseUriString, uri);
    if (File.Exists(relativeToBase))
    {
        return (relativeToBase, false);
    }

    if (File.Exists(uri))
    {
        return (uri, false);
    }

    Uri absoluteUri;
    bool validUri;

    if (Uri.TryCreate(baseUriString, UriKind.Absolute, out Uri baseUri))
    {
        validUri = Uri.TryCreate(baseUri, uri, out absoluteUri);
    }
    else
    {
        validUri = Uri.TryCreate(uri, UriKind.Absolute, out absoluteUri);
    }

    if (!validUri)
    {
        return (null, false);
    }

    string downloadDirectory = Path.GetTempFileName();
    File.Delete(downloadDirectory);
    Directory.CreateDirectory(downloadDirectory);

    string fileDest = Path.Combine(downloadDirectory, Path.GetFileName(absoluteUri.LocalPath));

    try
    {
        Console.Error.WriteLine();
        Console.Error.Write("Downloading {0}...", absoluteUri);

        using (WebClient client = new WebClient())
        {
            client.DownloadFile(absoluteUri, fileDest);
        }

        Console.Error.WriteLine(" Done.");

        string newName = FixFileExtensionBasedOnContent(fileDest);

        if (!string.Equals(newName, fileDest, StringComparison.Ordinal))
        {
            File.Move(fileDest, newName);
        }

        // Return the name under which the file exists now: after the move, fileDest is gone.
        return (newName, true);
    }
    catch (Exception ex)
    {
        Console.Error.WriteLine(" Failed!");
        Console.Error.WriteLine(ex.Message);

        // Do not leave the temporary directory behind after a failed download.
        Directory.Delete(downloadDirectory, true);
        return (null, false);
    }
}
348 private static string FixFileExtensionBasedOnContent(
string fileName)
350 using (FileStream fileStream = File.OpenRead(fileName))
356 using (var xmlReader = System.Xml.XmlReader.Create(fileStream))
358 isSvg = xmlReader.MoveToContent() == System.Xml.XmlNodeType.Element &&
"svg".Equals(xmlReader.Name, StringComparison.OrdinalIgnoreCase);
368 return fileName +
".svg";
372 fileStream.Seek(0, SeekOrigin.Begin);
373 byte[] header =
new byte[8];
375 for (
int i = 0; i < header.Length; i++)
377 header[i] = (byte)fileStream.ReadByte();
380 if (header[0] == 0x42 && header[1] == 0x4D)
382 return fileName +
".bmp";
384 else if (header[0] == 0x47 && header[1] == 0x49 && header[2] == 0x46 && header[3] == 0x38)
386 return fileName +
".gif";
388 else if (header[0] == 0xFF && header[1] == 0xD8 && header[2] == 0xFF && (header[3] == 0xDB || header[3] == 0xE0 || header[3] == 0xEE || header[3] == 0xE1))
390 return fileName +
".jpg";
392 else if (header[0] == 0x25 && header[1] == 0x50 && header[2] == 0x44 && header[3] == 0x46 && header[4] == 0x2D)
394 return fileName +
".pdf";
396 else if (header[0] == 0x89 && header[1] == 0x50 && header[2] == 0x4E && header[3] == 0x47 && header[4] == 0x0D && header[5] == 0x0A && header[6] == 0x1A && header[7] == 0x0A)
398 return fileName +
".png";
400 else if ((header[0] == 0x49 && header[1] == 0x49 && header[2] == 0x2A && header[3] == 0x00) || (header[0] == 0x4D && header[1] == 0x4D && header[2] == 0x00 && header[3] == 0x2A))
402 return fileName +
".tif";