SatIndex.cs
author StephaneLenclud
Sat, 06 Oct 2018 14:07:31 +0200
changeset 9 b77b09f680e7
parent 8 adff2dec03a0
permissions -rw-r--r--
SatIndex grabber now working.
     1 using CsQuery;
     2 using System;
     3 using System.Collections.Generic;
     4 using System.Diagnostics;
     5 using System.Linq;
     6 using System.Net;
     7 using System.Text;
     8 using System.Text.RegularExpressions;
     9 using System.Threading.Tasks;
    10 
    11 namespace SatChanGen
    12 {
    13     class SatIndex
    14     {
    15         public static Channel ParseChannel(string aUrl)
    16         {
    17             Channel channel = new Channel();
    18 
    19             string satIndex = new WebClient().DownloadString(aUrl);
    20             //Debug.Write(satIndex);
    21             CQ dom = satIndex;
    22 
    23             channel.Name = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Name')").Next().Text());
    24             //Convert from default encoding to UTF8
    25             //We spend a lot of time trying to get this right until we found our answer in the following thread.
    26             //http://stackoverflow.com/questions/14057434/how-can-i-transform-string-to-utf-8-in-c
    27             //byte[] bytes = Encoding.Default.GetBytes(channel.Name);
    28             //channel.Name = Encoding.UTF8.GetString(bytes);
    29             //
    30             channel.Satellite = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Satellit')").Next().Text());
    31             channel.OrbitalPosition = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Position')").Next().Text());
    32             // Frequency, remove dots and unit
    33             channel.Frequency = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Frequenz')").Next().Text());
    34             channel.Frequency = channel.Frequency.Replace(" MHz", "").Replace(".", "");
    35             // Just get 'H' or 'V' I guess
    36             channel.Polarisation = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Polarisation')").Next().Text()).Substring(0,1);
    37             channel.Transponder = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Transponder:')").Next().Text());
    38             channel.TransponderID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Transponder ID')").Next().Text()).Replace(".", "");
    39             channel.Beam = "Astra";
    40             channel.Standard = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Art')").Next().Text());
    41             channel.Modulation = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Modulation')").Next().Text());
    42             channel.SymbolRate = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Symbolrate')").Next().Text());
    43             channel.SymbolRate = channel.SymbolRate.Replace(" kSym/s", "").Replace(".", "");
    44             channel.FEC = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('FEC')").Next().Text());
    45             channel.Provider = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Provider')").Next().Text());
    46 
    47             channel.Bitrate = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Video Bitrate')").Next().Text());
    48             channel.NetworkID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Netzwerk ID')").Next().Text());
    49 
    50             channel.SID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Service ID')").Next().Text());
    51             channel.VPID = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Video Pid')").Next().Text());
    52             channel.PCR = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('PCR Pid')").Next().Text());
    53             channel.PMT = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('PMT Pid')").Next().Text());
    54             channel.TXT = WebUtility.HtmlDecode(dom.Find("td.chdet1:contains('Videotext Pid')").Next().Text()).Replace(" (kein Videotext)","");
    55 
    56             // We should get 4 entries:
    57             // - Category
    58             // - Country
    59             // - HD/SD TV
    60             // - Free/Pay TV
    61             CQ properties = dom.Find(".standart2");
    62             channel.Category = WebUtility.HtmlDecode(properties[0].InnerText).Trim();
    63             channel.Country = WebUtility.HtmlDecode(properties[1].InnerText).Trim();
    64 
    65             return channel;
    66         }
    67 
    68 
    69         //*[@id="container"]/div[2]/table[3]/tbody/tr/td/table[2]/tbody/tr/td
    70         public static List<Channel> Parse(IProgress<ProgressReport> aProgress, List<Channel> aChannels, string aUrl, string aOrbitalPosition, bool aUseChannelIdForName = false, string aCategoryOverride = "")
    71         {
    72             //Create our list of channels
    73             List<Channel> channels = new List<Channel>();
    74             //To avoid duplicated name
    75             Dictionary<string, int> names = new Dictionary<string, int>();
    76 
    77             string satIndex = new WebClient().DownloadString(aUrl);
    78             //Debug.Write(satIndex);
    79 
    80             CQ dom = satIndex;
    81 
    82             CQ channelsTd = dom.Find(".freq1");
    83 
    84             ProgressReport report = new ProgressReport();
    85             report.Max = channelsTd.Count();
    86             report.Value = 0;
    87             aProgress.Report(report);
    88 
    89             foreach ( IDomObject td in channelsTd)
    90             {
    91                 string channelUrl = "https://www.satindex.de" + td.FirstChild.GetAttribute("href");
    92 
    93                 Channel channel = ParseChannel(channelUrl);
    94 
    95                 //Make sure our channel name looks descent
    96                 channel.Name = CleanChannelName(channel.Name);
    97                 //Make sure the resulting name is unique to avoid having multiple tuning detail for a single channel
    98                 if (names.ContainsKey(channel.Name))
    99                 {
   100                     names[channel.Name]++;
   101                     channel.Name += " " + names[channel.Name];
   102                 }
   103                 else
   104                 {
   105                     names.Add(channel.Name, 1);
   106                 }
   107                 
   108                 // Add it to our collection
   109                 channels.Add(channel);
   110                 // Report progress
   111                 report.Value++;
   112                 aProgress.Report(report);
   113             }
   114 
   115             return channels;
   116 
   117 
   118             //Get all the Frequency tables in our page
   119             // Why is this not working? 
   120             //CQ sats = dom["#container > div:nth-child(2) > table:nth-child(16) > tbody > tr > td > table:nth-child(8) > tbody > tr > td > table"];
   121             // As a workaround we did the following
   122             CQ sats = dom["#container"]["div:nth-child(2)"]["table:nth-child(16)"]["tbody"]["tr"]["td"]["table:nth-child(8)"]["tbody"]["tr"]["td"]["table"];
   123 
   124             List<IDomObject> transponders = sats.ToList();
   125 
   126 
   127 
   128             foreach (IDomObject frq in transponders)
   129             {
   130                 Channel common = new Channel();
   131 
   132                 //Parse channel details
   133                 //common.OrbitalPosition = aOrbitalPosition;
   134                 //string wsm1 = WebUtility.HtmlDecode(frq.Cq().Find(".wsm1").Get(0).InnerText).Trim();
   135 
   136                 /*
   137                 common.Satellite = "Astra 19.2° East";
   138                 common.Frequency = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(3)").Get(0).InnerText);
   139                 common.Polarisation = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(4)").Get(0).InnerText);
   140                 common.Transponder = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(5) > a").Get(0).InnerText);
   141                 common.Beam = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(6) > a").Get(0).InnerText);
   142                 common.Standard = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(7)").Get(0).InnerText);
   143                 common.Modulation = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(8)").Get(0).InnerText);
   144                 common.SymbolRate = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(9) > a").Get(0).InnerText);
   145                 common.FEC = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(9) > a:nth-child(2)").Get(0).InnerText);
   146                 try
   147                 {
   148                     common.Provider = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(10) > b").Get(0).InnerText);
   149                 }
   150                 catch (Exception)
   151                 {
   152                 }
   153 
   154                 common.Bitrate = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(10)").Get(0).InnerText);
   155                 if (common.Bitrate.Substring(0, ", ".Length) == ", ")
   156                 {
   157                     common.Bitrate = common.Bitrate.Substring(", ".Length, common.Bitrate.Length - ", ".Length);
   158                 }
   159                 //
   160                 common.NetworkID = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(11)").Get(0).InnerText);
   161                 //common.NetworkID = common.NetworkID.Substring("NID:".Length, common.NetworkID.Length - "NID:".Length);
   162                 //
   163                 common.TransponderID = WebUtility.HtmlDecode(frq.Cq().Find("tbody > tr > td:nth-child(12)").Get(0).InnerText);
   164                 //common.TransponderID = common.TransponderID.Substring("TID:".Length, common.TransponderID.Length - "TID:".Length);
   165 
   166                 //We got common properties for the coming channels
   167                 //Debug.Write(common.ToString());
   168 
   169                 //Now get all the channels for that frequency
   170                 //Channel common = new Channel();
   171                 */
   172 
   173                 CQ channelsTableRows = frq.Cq().Find("tbody").Children("tr");
   174 
   175 
   176                 //CQ channelsDiv = frq.Cq().Next("div");
   177                 //CQ channelsTableRows = channelsDiv.Find("table.fl > tbody").Children("tr");
   178 
   179                 foreach (IDomObject row in channelsTableRows)
   180                 {
   181                     Channel channel = new Channel();
   182                     //Initialize this channel with common properties on this frequency
   183                     channel.Copy(common);
   184 
   185                     //Try and parse channel name
   186                     CQ cqChannelName = row.Cq().Find("td:nth-child(3) > a");
   187                     if (cqChannelName.Length == 0)
   188                     {
   189                         cqChannelName = row.Cq().Find("td:nth-child(3) > i");
   190                         if (cqChannelName.Length == 0)
   191                         {
   192                             //Can't get channel name
   193                             Debug.Write("WARNING: Can't find channel name! Skipping this channel");
   194                             continue;
   195                         }
   196                     }
   197 
   198                     string channelName = "";
   199                     if (cqChannelName.Get(0).HasAttribute("title") && aUseChannelIdForName)
   200                     {
   201                         //We want to use the channel ID
   202                         channelName = cqChannelName.Get(0).GetAttribute("title");
   203                     }
   204                     else
   205                     {
   206                         channelName = cqChannelName.Get(0).InnerText;
   207                     }
   208 
   209                     //Decode HTML
   210                     channel.Name = WebUtility.HtmlDecode(channelName);
   211                     //Convert from default encoding to UTF8
   212                     //We spend a lot of time trying to get this right until we found our answer in the following thread.
   213                     //http://stackoverflow.com/questions/14057434/how-can-i-transform-string-to-utf-8-in-c
   214                     byte[] bytes = Encoding.Default.GetBytes(channel.Name);
   215                     channel.Name = Encoding.UTF8.GetString(bytes);
   216 
   217 
   218 
   219                     if (channel.Name == "Name" || channel.Name == "Sorted by name")
   220                     {
   221                         //Skipping header rows
   222                         continue;
   223                     }
   224 
   225                     //Make sure our channel name looks descent
   226                     channel.Name = CleanChannelName(channel.Name);
   227                     //Make sure the resulting name is unique to avoid having multiple tuning detail for a single channel
   228                     if (names.ContainsKey(channel.Name))
   229                     {
   230                         names[channel.Name]++;
   231                         channel.Name += " " + names[channel.Name];
   232                     }
   233                     else
   234                     {
   235                         names.Add(channel.Name, 1);
   236                     }
   237 
   238                     //
   239                     //We don't want channels we already have
   240                     Channel existingChannel = aChannels.Find(c => c.Name == channel.Name);
   241                     if (existingChannel != null)
   242                     {
   243                         continue;
   244                     }
   245 
   246 
   247                     //So we have a channel name get the other properties then
   248                     channel.Country = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(4)").Get(0).InnerText).Trim();
   249                     channel.Category = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(5)").Get(0).InnerText).Trim();
   250                     if (channel.Category == "")
   251                     {
   252                         channel.Category = "Other";
   253                     }
   254 
   255                     //Override category if needed
   256                     if (aCategoryOverride != "")
   257                     {
   258                         channel.Category = aCategoryOverride;
   259                     }
   260 
   261                     //Skip the packages
   262                     //Skip the encryptions
   263                     channel.SID = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(8)").Get(0).InnerText).Trim();
   264                     channel.VPID = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(9)").Get(0).InnerText).Trim();
   265                     //Skip audios
   266                     channel.PMT = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
   267                     channel.PCR = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
   268                     channel.TXT = WebUtility.HtmlDecode(row.Cq().Find("td:nth-child(11)").Get(0).InnerText).Trim();
   269 
   270                     //Append that new channel to our list
   271                     channels.Add(channel);
   272 
   273                     //Show it in debug output
   274                     Debug.Write(channel);
   275                 } //For each channel
   276             } //For each frequency
   277 
   278             return channels;
   279         }
   280 
   281         //
   282         public static string CleanChannelName(string aName)
   283         {
   284             aName = aName.Trim();
   285             string[] remove = { " Germany", " Deutschland", " (Germany)", " (Deutschland)" };
   286 
   287             foreach (string item in remove)
   288             {
   289                 //if (aName.EndsWith(item))
   290                 if (aName.Contains(item))
   291                 {
   292                     aName = aName.Substring(0, aName.LastIndexOf(item));
   293                     break; //only allow one match at most
   294                 }
   295             }
   296 
   297             string[] removePrefix = { "Id: " };
   298 
   299             foreach (string item in removePrefix)
   300             {
   301                 if (aName.StartsWith(item))
   302                 {
   303                     aName = aName.Substring(item.Length, aName.Length - item.Length);
   304                     break; //only allow one match at most
   305                 }
   306             }
   307 
   308 
   309 
   310             aName = aName.Trim();
   311             return aName;
   312         }
   313 
   314         //
   315         public static List<Channel> CleanChannelList(List<Channel> aChannels)
   316         {
   317             //Create our list of channels
   318             List<Channel> channels = new List<Channel>();
   319 
   320             foreach (Channel channel in aChannels)
   321             {
   322                 Channel hdChannel = aChannels.Find(c => c.Name == channel.Name + " HD");
   323                 if (hdChannel == null
   324                     && !(channel.Name.Contains("Bundesliga") && !channel.Name.Contains("HD")) //We don't want non HD bundesliga
   325                     && !(channel.Name.StartsWith("Sky Sport") && !channel.Name.Contains("HD")) //We don't want non HD Sky Sport
   326                     )
   327                 {
   328 
   329                     if (channel.Category == "Allgemein"
   330                         && channel.Name.Contains("Sky"))
   331                     {
   332                         channel.Category = "Movies & Series";
   333                     }
   334 
   335                     if (channel.Name == "SYFY HD")
   336                     {
   337                         channel.Category = "Movies & Series";
   338                     }
   339 
   340                     // Patch Bundesliga channel names by removing Sport, cause they are way too long names
   341                     if (channel.Name.Contains("Bundesliga"))
   342                     {
   343                         channel.Name = channel.Name.Replace("Sport ", "");
   344                     }
   345 
   346                     //Patch some missing or bad categories
   347                     if (channel.Name.Contains("Bundesliga")
   348                         || channel.Name.Contains("Sport"))
   349                     {
   350                         channel.Category = "Sport";
   351                     }
   352 
   353                     if (channel.Name.Contains("Sky Select"))
   354                     {
   355                         channel.Category = "Pay per view";
   356                     }
   357 
   358                     if (channel.Name.Contains("TNT")
   359                         || channel.Name.Contains("13th"))
   360                     {
   361                         channel.Category = "Movies & Series";
   362                     }
   363 
   364                     if (channel.Category =="Kinderprogramm")
   365                     {
   366                         channel.Category = "Kids";
   367                     }
   368 
   369                     if (channel.Category == "Hinweistafel")
   370                     {
   371                         channel.Category = "General";
   372                     }
   373 
   374                     if (channel.Name.StartsWith("Sky Atlantic")
   375                         || channel.Name.StartsWith("SyFy")
   376                         || channel.Name.StartsWith("Fox"))
   377                     {
   378                         channel.Category = "Series";
   379                     }
   380 
   381                     //Collapse some categories
   382                     if (channel.Category == "Entertainment"                        
   383                         || channel.Category == "Kultur"
   384                         || channel.Category == "Verschiedenes")
   385                     {
   386                         channel.Category = "General";
   387                     }
   388 
   389                     if (channel.Category == "Musik"
   390                         || channel.Name.Contains("Music")
   391                         || channel.Name.Contains("Musik"))
   392                     {
   393                         channel.Category = "Music";
   394                     }
   395 
   396 
   397 
   398                     if (channel.Category == "Porn"
   399                         || channel.Category == "Erotik"
   400                         || channel.Name.Contains("Blue Movie")
   401                         || Regex.IsMatch(channel.Name,"Sex", RegexOptions.IgnoreCase)
   402                         || Regex.IsMatch(channel.Name, "Erotik", RegexOptions.IgnoreCase)
   403                         || Regex.IsMatch(channel.Name, "Girl", RegexOptions.IgnoreCase)
   404                         || Regex.IsMatch(channel.Name, "Eros", RegexOptions.IgnoreCase)
   405                         || Regex.IsMatch(channel.Name, "Gay", RegexOptions.IgnoreCase)
   406                         || Regex.IsMatch(channel.Name, "frauen", RegexOptions.IgnoreCase)
   407                         || Regex.IsMatch(channel.Name, "Maenner", RegexOptions.IgnoreCase)
   408                         || Regex.IsMatch(channel.Name, "bunny", RegexOptions.IgnoreCase)
   409                         || Regex.IsMatch(channel.Name, "date", RegexOptions.IgnoreCase)
   410                         )
   411                     {
   412                         channel.Category = "Erotic";
   413                     }
   414 
   415                     if (channel.Category == "Presentations"
   416                         || channel.Category == "Nachrichten")
   417                     {
   418                         channel.Category = "News";
   419                     }
   420 
   421                     if (channel.Category == "History"
   422                         || channel.Category == "Dokus / Reportagen")
   423                     {
   424                         channel.Category = "Documentaries";
   425                     }
   426 
   427                     if (channel.Category == "Travel"
   428                         || channel.Category == "Urlaub / Reisen")
   429                     {
   430                         channel.Category = "Documentaries";
   431                     }
   432 
   433 
   434                     if (channel.Category == "Lifestyle"
   435                         || channel.Category == "Allgemein"
   436                         || channel.Category == "Other"
   437                         || channel.Category == "Cultural")
   438                     {
   439                         channel.Category = "General";
   440                     }
   441 
   442                     if (channel.Category == "Movies"
   443                         || channel.Category == "Spielfilme")
   444                     {
   445                         channel.Category = "Movies & Series";
   446                     }
   447 
   448                     if (channel.Category == "Series")
   449                     {
   450                         channel.Category = "Movies & Series";
   451                     }
   452 
   453                     if (channel.Category == "Regional Programm")
   454                     {
   455                         channel.Category = "Regional";
   456                     }
   457 
   458                     //No corresponding HD channel, keep it then
   459                     channels.Add(channel);
   460                 }
   461                 else
   462                 {
   463                     Debug.Write("WARNING: Found HD channel for " + channel.Name + ". Discarding it!\n");
   464                 }
   465             }
   466 
   467             return channels;
   468         }
   469     }
   470 }