// SatIndex.cs as of changeset b77b09f680e7 (Sat Oct 06 14:07:31 2018 +0200), parent adff2dec03a0.
using CsQuery;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;

namespace SatChanGen
{
    /// <summary>
    /// Downloads channel list and channel detail pages, extracts tuning
    /// information from their HTML with CsQuery, and offers helpers to tidy up
    /// the resulting channel names and categories.
    /// </summary>
    class SatIndex
    {
        /// <summary>Prefix put in front of the relative links found on a channel list page.</summary>
        private const string SiteRoot = "https://www.satindex.de";

        /// <summary>
        /// Case-insensitive substrings that flag a channel name as erotic content.
        /// This is a plain substring match, so e.g. "date" also matches "Update".
        /// </summary>
        private static readonly Regex EroticNamePattern = new Regex(
            "Sex|Erotik|Girl|Eros|Gay|frauen|Maenner|bunny|date",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Downloads a single channel detail page and extracts its properties.
        /// </summary>
        /// <param name="aUrl">Absolute URL of the channel detail page.</param>
        /// <returns>
        /// A new <c>Channel</c> filled from the label/value cells of the page.
        /// <c>Beam</c> is always set to "Astra". <c>Category</c> and <c>Country</c>
        /// are only set when the page provides the matching ".standart2" elements.
        /// </returns>
        /// <exception cref="WebException">The page could not be downloaded.</exception>
        public static Channel ParseChannel(string aUrl)
        {
            Channel channel = new Channel();

            string page = DownloadPage(aUrl);
            CQ dom = page;

            channel.Name = ReadDetail(dom, "Name");
            channel.Satellite = ReadDetail(dom, "Satellit");
            channel.OrbitalPosition = ReadDetail(dom, "Position");

            // Frequency: strip the unit and the thousands separator.
            channel.Frequency = ReadDetail(dom, "Frequenz").Replace(" MHz", "").Replace(".", "");

            // Only the first character is kept (presumably 'H' or 'V' - verify against the site).
            // An empty cell yields an empty string instead of throwing.
            string polarisation = ReadDetail(dom, "Polarisation");
            channel.Polarisation = string.IsNullOrEmpty(polarisation)
                ? string.Empty
                : polarisation.Substring(0, 1);

            // The trailing colon keeps this label from also matching 'Transponder ID'.
            channel.Transponder = ReadDetail(dom, "Transponder:");
            channel.TransponderID = ReadDetail(dom, "Transponder ID").Replace(".", "");
            channel.Beam = "Astra";
            channel.Standard = ReadDetail(dom, "Art");
            channel.Modulation = ReadDetail(dom, "Modulation");

            // Symbol rate: strip the unit and the thousands separator.
            channel.SymbolRate = ReadDetail(dom, "Symbolrate").Replace(" kSym/s", "").Replace(".", "");
            channel.FEC = ReadDetail(dom, "FEC");
            channel.Provider = ReadDetail(dom, "Provider");

            channel.Bitrate = ReadDetail(dom, "Video Bitrate");
            channel.NetworkID = ReadDetail(dom, "Netzwerk ID");

            channel.SID = ReadDetail(dom, "Service ID");
            channel.VPID = ReadDetail(dom, "Video Pid");
            channel.PCR = ReadDetail(dom, "PCR Pid");
            channel.PMT = ReadDetail(dom, "PMT Pid");
            channel.TXT = ReadDetail(dom, "Videotext Pid").Replace(" (kein Videotext)", "");

            // We should get 4 entries:
            // - Category
            // - Country
            // - HD/SD TV
            // - Free/Pay TV
            // Only the first two are used. Each one is read only if it is actually
            // present, so an incomplete page no longer aborts parsing of the channel.
            CQ properties = dom.Find(".standart2");
            if (properties.Length > 0)
            {
                channel.Category = WebUtility.HtmlDecode(properties[0].InnerText).Trim();
            }
            if (properties.Length > 1)
            {
                channel.Country = WebUtility.HtmlDecode(properties[1].InnerText).Trim();
            }

            return channel;
        }

        /// <summary>
        /// Downloads a channel list page, follows the link in each ".freq1" cell
        /// to the channel detail page and parses it with <see cref="ParseChannel"/>.
        /// </summary>
        /// <param name="aProgress">Receives a report before the first channel and after every processed cell.</param>
        /// <param name="aChannels">Currently unused; kept so existing callers keep compiling.</param>
        /// <param name="aUrl">Absolute URL of the channel list page.</param>
        /// <param name="aOrbitalPosition">Currently unused; kept so existing callers keep compiling.</param>
        /// <param name="aUseChannelIdForName">Currently unused; kept so existing callers keep compiling.</param>
        /// <param name="aCategoryOverride">Currently unused; kept so existing callers keep compiling.</param>
        /// <returns>
        /// The parsed channels in page order. Names are cleaned with
        /// <see cref="CleanChannelName"/> and repeated names get a numeric suffix.
        /// </returns>
        /// <exception cref="WebException">The list page or one of the detail pages could not be downloaded.</exception>
        public static List<Channel> Parse(IProgress<ProgressReport> aProgress, List<Channel> aChannels, string aUrl, string aOrbitalPosition, bool aUseChannelIdForName = false, string aCategoryOverride = "")
        {
            // Create our list of channels
            List<Channel> channels = new List<Channel>();
            // Occurrence count per cleaned name, used to avoid duplicated names
            Dictionary<string, int> names = new Dictionary<string, int>();

            string page = DownloadPage(aUrl);
            CQ dom = page;

            CQ channelsTd = dom.Find(".freq1");

            ProgressReport report = new ProgressReport();
            report.Max = channelsTd.Length;
            report.Value = 0;
            aProgress.Report(report);

            foreach (IDomObject td in channelsTd)
            {
                // The first child of the cell is expected to be the link to the detail page.
                IDomObject link = td.FirstChild;
                string href = link != null ? link.GetAttribute("href") : null;

                if (string.IsNullOrEmpty(href))
                {
                    // Without a link there is nothing to download for this cell.
                    Debug.Write("WARNING: Can't find channel link! Skipping this channel\n");
                }
                else
                {
                    Channel channel = ParseChannel(SiteRoot + href);

                    // Make sure our channel name looks decent and is unique, to avoid
                    // having multiple tuning details for a single channel.
                    channel.Name = MakeUniqueName(names, CleanChannelName(channel.Name));

                    // Add it to our collection
                    channels.Add(channel);
                }

                // Report progress for every cell, skipped or not, so Value ends up equal to Max.
                report.Value++;
                aProgress.Report(report);
            }

            return channels;
        }

        /// <summary>
        /// Normalizes a raw channel name: trims it, cuts it at the last occurrence
        /// of the first matching country marker and removes a leading "Id: ".
        /// </summary>
        /// <param name="aName">Raw channel name; <c>null</c> is treated as empty.</param>
        /// <returns>The cleaned, trimmed name.</returns>
        public static string CleanChannelName(string aName)
        {
            if (aName == null)
            {
                return string.Empty;
            }

            aName = aName.Trim();

            string[] remove = { " Germany", " Deutschland", " (Germany)", " (Deutschland)" };
            foreach (string item in remove)
            {
                // The marker may appear anywhere; everything from its last occurrence
                // onwards is dropped. One ordinal lookup replaces Contains + LastIndexOf.
                int index = aName.LastIndexOf(item, StringComparison.Ordinal);
                if (index >= 0)
                {
                    aName = aName.Substring(0, index);
                    break; // only allow one match at most
                }
            }

            string[] removePrefix = { "Id: " };
            foreach (string item in removePrefix)
            {
                if (aName.StartsWith(item, StringComparison.Ordinal))
                {
                    aName = aName.Substring(item.Length);
                    break; // only allow one match at most
                }
            }

            return aName.Trim();
        }

        /// <summary>
        /// Filters a channel list and normalizes names and categories.
        /// A channel is dropped when the list also contains "&lt;name&gt; HD", or when
        /// it is a non HD Bundesliga or Sky Sport channel.
        /// </summary>
        /// <param name="aChannels">Channels to clean. The kept channels are modified in place.</param>
        /// <returns>A new list holding the kept channels in their original order.</returns>
        public static List<Channel> CleanChannelList(List<Channel> aChannels)
        {
            // Create our list of channels
            List<Channel> channels = new List<Channel>();

            foreach (Channel channel in aChannels)
            {
                string hdName = channel.Name + " HD";
                if (aChannels.Exists(c => c.Name == hdName))
                {
                    Debug.Write("WARNING: Found HD channel for " + channel.Name + ". Discarding it!\n");
                    continue;
                }

                bool isHd = channel.Name.Contains("HD");
                bool isUnwantedSd = !isHd
                    && (channel.Name.Contains("Bundesliga") // We don't want non HD Bundesliga
                        || channel.Name.StartsWith("Sky Sport", StringComparison.Ordinal)); // We don't want non HD Sky Sport
                if (isUnwantedSd)
                {
                    Debug.Write("WARNING: Discarding non HD channel " + channel.Name + "\n");
                    continue;
                }

                ApplyNameAndCategoryRules(channel);

                // No corresponding HD channel, keep it then
                channels.Add(channel);
            }

            return channels;
        }

        /// <summary>Downloads a page as a string and releases the web client afterwards.</summary>
        private static string DownloadPage(string aUrl)
        {
            using (WebClient client = new WebClient())
            {
                return client.DownloadString(aUrl);
            }
        }

        /// <summary>
        /// Returns the HTML-decoded text of the element following the "td.chdet1"
        /// cell whose text contains <paramref name="aLabel"/>.
        /// </summary>
        private static string ReadDetail(CQ aDom, string aLabel)
        {
            return WebUtility.HtmlDecode(aDom.Find("td.chdet1:contains('" + aLabel + "')").Next().Text());
        }

        /// <summary>
        /// Returns <paramref name="aName"/> unchanged the first time it is seen;
        /// later occurrences get " 2", " 3", ... appended.
        /// </summary>
        private static string MakeUniqueName(Dictionary<string, int> aNames, string aName)
        {
            int count;
            if (aNames.TryGetValue(aName, out count))
            {
                count++;
                aNames[aName] = count;
                return aName + " " + count;
            }

            aNames.Add(aName, 1);
            return aName;
        }

        /// <summary>
        /// Applies the name patch and the category mapping rules to one channel.
        /// The rules run in a fixed order and later rules see the result of earlier ones.
        /// </summary>
        private static void ApplyNameAndCategoryRules(Channel channel)
        {
            if (channel.Category == "Allgemein" && channel.Name.Contains("Sky"))
            {
                channel.Category = "Movies & Series";
            }

            if (channel.Name == "SYFY HD")
            {
                channel.Category = "Movies & Series";
            }

            // Patch Bundesliga channel names by removing "Sport ", because the names are way too long
            if (channel.Name.Contains("Bundesliga"))
            {
                channel.Name = channel.Name.Replace("Sport ", "");
            }

            // Patch some missing or bad categories
            if (channel.Name.Contains("Bundesliga") || channel.Name.Contains("Sport"))
            {
                channel.Category = "Sport";
            }

            if (channel.Name.Contains("Sky Select"))
            {
                channel.Category = "Pay per view";
            }

            if (channel.Name.Contains("TNT") || channel.Name.Contains("13th"))
            {
                channel.Category = "Movies & Series";
            }

            if (channel.Category == "Kinderprogramm")
            {
                channel.Category = "Kids";
            }

            if (channel.Category == "Hinweistafel")
            {
                channel.Category = "General";
            }

            if (channel.Name.StartsWith("Sky Atlantic", StringComparison.Ordinal)
                || channel.Name.StartsWith("SyFy", StringComparison.Ordinal)
                || channel.Name.StartsWith("Fox", StringComparison.Ordinal))
            {
                channel.Category = "Series";
            }

            // Collapse some categories
            if (channel.Category == "Entertainment"
                || channel.Category == "Kultur"
                || channel.Category == "Verschiedenes")
            {
                channel.Category = "General";
            }

            if (channel.Category == "Musik"
                || channel.Name.Contains("Music")
                || channel.Name.Contains("Musik"))
            {
                channel.Category = "Music";
            }

            if (channel.Category == "Porn"
                || channel.Category == "Erotik"
                || channel.Name.Contains("Blue Movie")
                || EroticNamePattern.IsMatch(channel.Name))
            {
                channel.Category = "Erotic";
            }

            if (channel.Category == "Presentations" || channel.Category == "Nachrichten")
            {
                channel.Category = "News";
            }

            if (channel.Category == "History" || channel.Category == "Dokus / Reportagen")
            {
                channel.Category = "Documentaries";
            }

            if (channel.Category == "Travel" || channel.Category == "Urlaub / Reisen")
            {
                channel.Category = "Documentaries";
            }

            if (channel.Category == "Lifestyle"
                || channel.Category == "Allgemein"
                || channel.Category == "Other"
                || channel.Category == "Cultural")
            {
                channel.Category = "General";
            }

            if (channel.Category == "Movies" || channel.Category == "Spielfilme")
            {
                channel.Category = "Movies & Series";
            }

            if (channel.Category == "Series")
            {
                channel.Category = "Movies & Series";
            }

            if (channel.Category == "Regional Programm")
            {
                channel.Category = "Regional";
            }
        }
    }
}