1.1 --- a/SatIndex.cs Thu Oct 04 19:25:35 2018 +0200
1.2 +++ b/SatIndex.cs Sat Oct 06 14:07:31 2018 +0200
1.3 @@ -1,12 +1,470 @@
1.4 -using System;
1.5 +using CsQuery;
1.6 +using System;
1.7 using System.Collections.Generic;
1.8 +using System.Diagnostics;
1.9 using System.Linq;
1.10 +using System.Net;
1.11 using System.Text;
1.12 +using System.Text.RegularExpressions;
1.13 using System.Threading.Tasks;
1.14
1.15 namespace SatChanGen
1.16 {
1.17 class SatIndex
1.18 {
1.19 + public static Channel ParseChannel(string aUrl)
1.20 + {
1.21 + Channel channel = new Channel();
1.22 + // Download the page at aUrl and fill a new Channel from its detail cells; the populated Channel is returned.
1.23 + string satIndex = new WebClient().DownloadString(aUrl);
1.24 + //Debug.Write(satIndex);
1.25 + CQ dom = satIndex;
1.26 + // Each field below: find the td.chdet1 cell containing the label, take Text() of what Next() returns, then HTML-decode it.
1.27 + channel.Name = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Name')").Next().Text());
1.28 + //Convert from default encoding to UTF8 (currently disabled, see the commented-out lines below)
1.29 + //We spent a lot of time trying to get this right until we found our answer in the following thread.
1.30 + //http://stackoverflow.com/questions/14057434/how-can-i-transform-string-to-utf-8-in-c
1.31 + //byte[] bytes = Encoding.Default.GetBytes(channel.Name);
1.32 + //channel.Name = Encoding.UTF8.GetString(bytes);
1.33 + //
1.34 + channel.Satellite = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Satellit')").Next().Text());
1.35 + channel.OrbitalPosition = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Position')").Next().Text());
1.36 + // Frequency: strip the " MHz" unit and the dots
1.37 + channel.Frequency = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Frequenz')").Next().Text());
1.38 + channel.Frequency = channel.Frequency.Replace(" MHz", "").Replace(".", "");
1.39 + // Keep only the first character (presumably 'H' or 'V'). NOTE(review): Substring(0,1) throws if the decoded text is empty.
1.40 + channel.Polarisation = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Polarisation')").Next().Text()).Substring(0,1);
1.41 + channel.Transponder = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Transponder:')").Next().Text());
1.42 + channel.TransponderID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Transponder ID')").Next().Text()).Replace(".", "");
1.43 + channel.Beam = "Astra";
1.44 + channel.Standard = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Art')").Next().Text());
1.45 + channel.Modulation = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Modulation')").Next().Text());
1.46 + channel.SymbolRate = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Symbolrate')").Next().Text());
1.47 + channel.SymbolRate = channel.SymbolRate.Replace(" kSym/s", "").Replace(".", "");
1.48 + channel.FEC = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('FEC')").Next().Text());
1.49 + channel.Provider = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Provider')").Next().Text());
1.50 +
1.51 + channel.Bitrate = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Video Bitrate')").Next().Text());
1.52 + channel.NetworkID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Netzwerk ID')").Next().Text());
1.53 +
1.54 + channel.SID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Service ID')").Next().Text());
1.55 + channel.VPID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Video Pid')").Next().Text());
1.56 + channel.PCR = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('PCR Pid')").Next().Text());
1.57 + channel.PMT = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('PMT Pid')").Next().Text());
1.58 + channel.TXT = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Videotext Pid')").Next().Text()).Replace(" (kein Videotext)","");
1.59 +
1.60 + // We should get 4 entries (only the first two are read below):
1.61 + // - Category
1.62 + // - Country
1.63 + // - HD/SD TV
1.64 + // - Free/Pay TV
1.65 + CQ properties = dom.Find(".standart2");
1.66 + channel.Category = WebUtility.HtmlDecode(properties[0].InnerText).Trim();
1.67 + channel.Country = WebUtility.HtmlDecode(properties[1].InnerText).Trim();
1.68 +
1.69 + return channel;
1.70 + }
1.71 + // Downloads aUrl, calls ParseChannel for the link in every .freq1 cell and returns the resulting channels, reporting progress via aProgress.
1.72 + // NOTE(review): aChannels, aUseChannelIdForName and aCategoryOverride are only used in the unreachable code after the first return; aOrbitalPosition is not used at all.
1.73 + //*[@id="container"]/div[2]/table[3]/tbody/tr/td/table[2]/tbody/tr/td
1.74 + public static List<Channel> Parse(IProgress<ProgressReport> aProgress, List<Channel> aChannels, string aUrl, string aOrbitalPosition, bool aUseChannelIdForName = false, string aCategoryOverride = "")
1.75 + {
1.76 + //Create our list of channels
1.77 + List<Channel> channels = new List<Channel>();
1.78 + //To avoid duplicated names
1.79 + Dictionary<string, int> names = new Dictionary<string, int>();
1.80 +
1.81 + string satIndex = new WebClient().DownloadString(aUrl);
1.82 + //Debug.Write(satIndex);
1.83 +
1.84 + CQ dom = satIndex;
1.85 +
1.86 + CQ channelsTd = dom.Find(".freq1");
1.87 + // Progress starts at 0 out of the number of .freq1 cells and is bumped once per parsed channel.
1.88 + ProgressReport report = new ProgressReport();
1.89 + report.Max = channelsTd.Count();
1.90 + report.Value = 0;
1.91 + aProgress.Report(report);
1.92 + // The first child of each .freq1 cell is expected to carry the channel page href, which is appended to https://www.satindex.de
1.93 + foreach ( IDomObject td in channelsTd)
1.94 + {
1.95 + string channelUrl = "https://www.satindex.de" + td.FirstChild.GetAttribute("href");
1.96 +
1.97 + Channel channel = ParseChannel(channelUrl);
1.98 +
1.99 + //Make sure our channel name looks decent
1.100 + channel.Name = CleanChannelName(channel.Name);
1.101 + //Make sure the resulting name is unique to avoid having multiple tuning details for a single channel: repeats get " 2", " 3", ... appended
1.102 + if (names.ContainsKey(channel.Name))
1.103 + {
1.104 + names[channel.Name]++;
1.105 + channel.Name += " " + names[channel.Name];
1.106 + }
1.107 + else
1.108 + {
1.109 + names.Add(channel.Name, 1);
1.110 + }
1.111 +
1.112 + // Add it to our collection
1.113 + channels.Add(channel);
1.114 + // Report progress
1.115 + report.Value++;
1.116 + aProgress.Report(report);
1.117 + }
1.118 +
1.119 + return channels;
1.120 + // NOTE(review): everything from here to the end of the method is unreachable because of the return above;
1.121 + // it appears to be an earlier table-based parser kept for reference.
1.122 + //Get all the Frequency tables in our page
1.123 + // Why is this not working?
1.124 + //CQ sats = dom["#container > div:nth-child(2) > table:nth-child(16) > tbody > tr > td > table:nth-child(8) > tbody > tr > td > table"];
1.125 + // As a workaround we did the following
1.126 + CQ sats = dom["#container"]["div:nth-child(2)"]["table:nth-child(16)"]["tbody"]["tr"]["td"]["table:nth-child(8)"]["tbody"]["tr"]["td"]["table"];
1.127 +
1.128 + List<IDomObject> transponders = sats.ToList();
1.129 +
1.130 +
1.131 +
1.132 + foreach (IDomObject frq in transponders)
1.133 + {
1.134 + Channel common = new Channel();
1.135 +
1.136 + //Parse channel details
1.137 + //common.OrbitalPosition = aOrbitalPosition;
1.138 + //string wsm1 = WebUtility.HtmlDecode(frq.Cq().Find(".wsm1").Get(0).InnerText).Trim();
1.139 +
1.140 + /*
1.141 + common.Satellite = "Astra 19.2° East";
1.142 + common.Frequency = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(3)").Get(0).InnerText);
1.143 + common.Polarisation = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(4)").Get(0).InnerText);
1.144 + common.Transponder = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(5) > a").Get(0).InnerText);
1.145 + common.Beam = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(6) > a").Get(0).InnerText);
1.146 + common.Standard = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(7)").Get(0).InnerText);
1.147 + common.Modulation = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(8)").Get(0).InnerText);
1.148 + common.SymbolRate = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(9) > a").Get(0).InnerText);
1.149 + common.FEC = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(9) > a:nth-child(2)").Get(0).InnerText);
1.150 + try
1.151 + {
1.152 + common.Provider = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(10) > b").Get(0).InnerText);
1.153 + }
1.154 + catch (Exception)
1.155 + {
1.156 + }
1.157 +
1.158 + common.Bitrate = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(10)").Get(0).InnerText);
1.159 + if (common.Bitrate.Substring(0, ", ".Length) == ", ")
1.160 + {
1.161 + common.Bitrate = common.Bitrate.Substring(", ".Length, common.Bitrate.Length - ", ".Length);
1.162 + }
1.163 + //
1.164 + common.NetworkID = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(11)").Get(0).InnerText);
1.165 + //common.NetworkID = common.NetworkID.Substring("NID:".Length, common.NetworkID.Length - "NID:".Length);
1.166 + //
1.167 + common.TransponderID = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(12)").Get(0).InnerText);
1.168 + //common.TransponderID = common.TransponderID.Substring("TID:".Length, common.TransponderID.Length - "TID:".Length);
1.169 +
1.170 + //We got common properties for the coming channels
1.171 + //Debug.Write(common.ToString());
1.172 +
1.173 + //Now get all the channels for that frequency
1.174 + //Channel common = new Channel();
1.175 + */
1.176 +
1.177 + CQ channelsTableRows = frq.Cq().Find("tbody").Children("tr");
1.178 +
1.179 +
1.180 + //CQ channelsDiv = frq.Cq().Next("div");
1.181 + //CQ channelsTableRows = channelsDiv.Find("table.fl > tbody").Children("tr");
1.182 +
1.183 + foreach (IDomObject row in channelsTableRows)
1.184 + {
1.185 + Channel channel = new Channel();
1.186 + //Initialize this channel with common properties on this frequency
1.187 + channel.Copy(common);
1.188 +
1.189 + //Try and parse channel name
1.190 + CQ cqChannelName = row.Cq().Find("td:nth-child(3) > a");
1.191 + if (cqChannelName.Length == 0)
1.192 + {
1.193 + cqChannelName = row.Cq().Find("td:nth-child(3) > i");
1.194 + if (cqChannelName.Length == 0)
1.195 + {
1.196 + //Can't get channel name
1.197 + Debug.Write("WARNING: Can't find channel name! Skipping this channel");
1.198 + continue;
1.199 + }
1.200 + }
1.201 +
1.202 + string channelName = "";
1.203 + if (cqChannelName.Get(0).HasAttribute("title") && aUseChannelIdForName)
1.204 + {
1.205 + //We want to use the channel ID
1.206 + channelName = cqChannelName.Get(0).GetAttribute("title");
1.207 + }
1.208 + else
1.209 + {
1.210 + channelName = cqChannelName.Get(0).InnerText;
1.211 + }
1.212 +
1.213 + //Decode HTML
1.214 + channel.Name = WebUtility.HtmlDecode(channelName);
1.215 + //Convert from default encoding to UTF8
1.216 + //We spent a lot of time trying to get this right until we found our answer in the following thread.
1.217 + //http://stackoverflow.com/questions/14057434/how-can-i-transform-string-to-utf-8-in-c
1.218 + byte[] bytes = Encoding.Default.GetBytes(channel.Name);
1.219 + channel.Name = Encoding.UTF8.GetString(bytes);
1.220 +
1.221 +
1.222 +
1.223 + if (channel.Name == "Name" || channel.Name == "Sorted by name")
1.224 + {
1.225 + //Skipping header rows
1.226 + continue;
1.227 + }
1.228 +
1.229 + //Make sure our channel name looks decent
1.230 + channel.Name = CleanChannelName(channel.Name);
1.231 + //Make sure the resulting name is unique to avoid having multiple tuning details for a single channel
1.232 + if (names.ContainsKey(channel.Name))
1.233 + {
1.234 + names[channel.Name]++;
1.235 + channel.Name += " " + names[channel.Name];
1.236 + }
1.237 + else
1.238 + {
1.239 + names.Add(channel.Name, 1);
1.240 + }
1.241 +
1.242 + //
1.243 + //We don't want channels we already have
1.244 + Channel existingChannel = aChannels.Find(c => c.Name == channel.Name);
1.245 + if (existingChannel != null)
1.246 + {
1.247 + continue;
1.248 + }
1.249 +
1.250 +
1.251 + //So we have a channel name get the other properties then
1.252 + channel.Country = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(4)").Get(0).InnerText).Trim();
1.253 + channel.Category = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(5)").Get(0).InnerText).Trim();
1.254 + if (channel.Category == "")
1.255 + {
1.256 + channel.Category = "Other";
1.257 + }
1.258 +
1.259 + //Override category if needed
1.260 + if (aCategoryOverride != "")
1.261 + {
1.262 + channel.Category = aCategoryOverride;
1.263 + }
1.264 +
1.265 + //Skip the packages
1.266 + //Skip the encryptions
1.267 + channel.SID = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(8)").Get(0).InnerText).Trim();
1.268 + channel.VPID = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(9)").Get(0).InnerText).Trim();
1.269 + //Skip audios
1.270 + channel.PMT = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
1.271 + channel.PCR = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
1.272 + channel.TXT = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
1.273 + // NOTE(review): PMT, PCR and TXT above all read the same cell (td:nth-child(11)) - confirm this is intended.
1.274 + //Append that new channel to our list
1.275 + channels.Add(channel);
1.276 +
1.277 + //Show it in debug output
1.278 + Debug.Write(channel);
1.279 + } //For each channel
1.280 + } //For each frequency
1.281 +
1.282 + return channels;
1.283 + }
1.284 +
1.285 + // Trims aName, cuts it at the first matching country marker, strips a leading "Id: " prefix, trims again and returns the result.
1.286 + public static string CleanChannelName(string aName)
1.287 + {
1.288 + aName = aName.Trim();
1.289 + string[] remove = { " Germany", " Deutschland", " (Germany)", " (Deutschland)" };
1.290 + // The name is cut at the LAST occurrence of the first marker found, so any text following the marker is dropped as well.
1.291 + foreach (string item in remove)
1.292 + {
1.293 + //if (aName.EndsWith(item))
1.294 + if (aName.Contains(item))
1.295 + {
1.296 + aName = aName.Substring(0, aName.LastIndexOf(item));
1.297 + break; //only allow one match at most
1.298 + }
1.299 + }
1.300 +
1.301 + string[] removePrefix = { "Id: " };
1.302 +
1.303 + foreach (string item in removePrefix)
1.304 + {
1.305 + if (aName.StartsWith(item))
1.306 + {
1.307 + aName = aName.Substring(item.Length, aName.Length - item.Length);
1.308 + break; //only allow one match at most
1.309 + }
1.310 + }
1.311 +
1.312 +
1.313 +
1.314 + aName = aName.Trim();
1.315 + return aName;
1.316 + }
1.317 +
1.318 + // Returns a new list with the channels of aChannels that pass the filter below; category (and some name) fixes are written to the channel objects themselves.
1.319 + public static List<Channel> CleanChannelList(List<Channel> aChannels)
1.320 + {
1.321 + //Create our list of channels
1.322 + List<Channel> channels = new List<Channel>();
1.323 + // A channel is dropped when aChannels also contains "<name> HD", or when it is a Bundesliga / "Sky Sport" channel without "HD" in its name.
1.324 + foreach (Channel channel in aChannels)
1.325 + {
1.326 + Channel hdChannel = aChannels.Find(c => c.Name == channel.Name + " HD");
1.327 + if (hdChannel == null
1.328 + && !(channel.Name.Contains("Bundesliga") && !channel.Name.Contains("HD")) //We don't want non HD bundesliga
1.329 + && !(channel.Name.StartsWith("Sky Sport") && !channel.Name.Contains("HD")) //We don't want non HD Sky Sport
1.330 + )
1.331 + {
1.332 + // The rules below run in order and are not exclusive: a later rule can overwrite the category set by an earlier one.
1.333 + if (channel.Category == "Allgemein"
1.334 + && channel.Name.Contains("Sky"))
1.335 + {
1.336 + channel.Category = "Movies & Series";
1.337 + }
1.338 +
1.339 + if (channel.Name == "SYFY HD")
1.340 + {
1.341 + channel.Category = "Movies & Series";
1.342 + }
1.343 +
1.344 + // Patch Bundesliga channel names by removing "Sport ", because those names are way too long
1.345 + if (channel.Name.Contains("Bundesliga"))
1.346 + {
1.347 + channel.Name = channel.Name.Replace("Sport ", "");
1.348 + }
1.349 +
1.350 + //Patch some missing or bad categories
1.351 + if (channel.Name.Contains("Bundesliga")
1.352 + || channel.Name.Contains("Sport"))
1.353 + {
1.354 + channel.Category = "Sport";
1.355 + }
1.356 +
1.357 + if (channel.Name.Contains("Sky Select"))
1.358 + {
1.359 + channel.Category = "Pay per view";
1.360 + }
1.361 +
1.362 + if (channel.Name.Contains("TNT")
1.363 + || channel.Name.Contains("13th"))
1.364 + {
1.365 + channel.Category = "Movies & Series";
1.366 + }
1.367 +
1.368 + if (channel.Category =="Kinderprogramm")
1.369 + {
1.370 + channel.Category = "Kids";
1.371 + }
1.372 +
1.373 + if (channel.Category == "Hinweistafel")
1.374 + {
1.375 + channel.Category = "General";
1.376 + }
1.377 +
1.378 + if (channel.Name.StartsWith("Sky Atlantic")
1.379 + || channel.Name.StartsWith("SyFy")
1.380 + || channel.Name.StartsWith("Fox"))
1.381 + {
1.382 + channel.Category = "Series";
1.383 + }
1.384 +
1.385 + //Collapse some categories
1.386 + if (channel.Category == "Entertainment"
1.387 + || channel.Category == "Kultur"
1.388 + || channel.Category == "Verschiedenes")
1.389 + {
1.390 + channel.Category = "General";
1.391 + }
1.392 +
1.393 + if (channel.Category == "Musik"
1.394 + || channel.Name.Contains("Music")
1.395 + || channel.Name.Contains("Musik"))
1.396 + {
1.397 + channel.Category = "Music";
1.398 + }
1.399 +
1.400 +
1.401 + // NOTE(review): the Regex checks below are case-insensitive substring matches, so e.g. "date" also matches a name containing "Update".
1.402 + if (channel.Category == "Porn"
1.403 + || channel.Category == "Erotik"
1.404 + || channel.Name.Contains("Blue Movie")
1.405 + || Regex.IsMatch(channel.Name,"Sex", RegexOptions.IgnoreCase)
1.406 + || Regex.IsMatch(channel.Name, "Erotik", RegexOptions.IgnoreCase)
1.407 + || Regex.IsMatch(channel.Name, "Girl", RegexOptions.IgnoreCase)
1.408 + || Regex.IsMatch(channel.Name, "Eros", RegexOptions.IgnoreCase)
1.409 + || Regex.IsMatch(channel.Name, "Gay", RegexOptions.IgnoreCase)
1.410 + || Regex.IsMatch(channel.Name, "frauen", RegexOptions.IgnoreCase)
1.411 + || Regex.IsMatch(channel.Name, "Maenner", RegexOptions.IgnoreCase)
1.412 + || Regex.IsMatch(channel.Name, "bunny", RegexOptions.IgnoreCase)
1.413 + || Regex.IsMatch(channel.Name, "date", RegexOptions.IgnoreCase)
1.414 + )
1.415 + {
1.416 + channel.Category = "Erotic";
1.417 + }
1.418 +
1.419 + if (channel.Category == "Presentations"
1.420 + || channel.Category == "Nachrichten")
1.421 + {
1.422 + channel.Category = "News";
1.423 + }
1.424 +
1.425 + if (channel.Category == "History"
1.426 + || channel.Category == "Dokus / Reportagen")
1.427 + {
1.428 + channel.Category = "Documentaries";
1.429 + }
1.430 +
1.431 + if (channel.Category == "Travel"
1.432 + || channel.Category == "Urlaub / Reisen")
1.433 + {
1.434 + channel.Category = "Documentaries";
1.435 + }
1.436 +
1.437 +
1.438 + if (channel.Category == "Lifestyle"
1.439 + || channel.Category == "Allgemein"
1.440 + || channel.Category == "Other"
1.441 + || channel.Category == "Cultural")
1.442 + {
1.443 + channel.Category = "General";
1.444 + }
1.445 +
1.446 + if (channel.Category == "Movies"
1.447 + || channel.Category == "Spielfilme")
1.448 + {
1.449 + channel.Category = "Movies & Series";
1.450 + }
1.451 +
1.452 + if (channel.Category == "Series")
1.453 + {
1.454 + channel.Category = "Movies & Series";
1.455 + }
1.456 +
1.457 + if (channel.Category == "Regional Programm")
1.458 + {
1.459 + channel.Category = "Regional";
1.460 + }
1.461 +
1.462 + //Channel passed the filter above, keep it then
1.463 + channels.Add(channel);
1.464 + }
1.465 + else
1.466 + {
1.467 + Debug.Write("WARNING: Found HD channel for " + channel.Name + ". Discarding it!\n");
1.468 + }
1.469 + }
1.470 +
1.471 + return channels;
1.472 + }
1.473 }
1.474 }