import logging import re from datetime import date, datetime, timedelta from typing import List from urllib.parse import unquote from bs4 import SoupStrainer from epg2xml.providers import EPGProgram, EPGProvider, no_endtime from epg2xml.utils import ParserBeautifulSoup as BeautifulSoup log = logging.getLogger(__name__.rsplit(".", maxsplit=1)[-1].upper()) CH_CATE = [ # 0은 전체 채널 {"id": "1", "name": "UHD"}, {"id": "3", "name": "홍보"}, {"id": "4", "name": "지상파"}, {"id": "5", "name": "홈쇼핑"}, {"id": "6", "name": "종합편성"}, {"id": "8", "name": "드라마/버라이어티"}, {"id": "10", "name": "오락/음악"}, {"id": "12", "name": "영화/시리즈"}, {"id": "137", "name": "스포츠"}, {"id": "206", "name": "취미/레저"}, {"id": "317", "name": "애니/유아"}, {"id": "442", "name": "교육"}, {"id": "446", "name": "다큐/교양"}, {"id": "447", "name": "뉴스/경제"}, {"id": "448", "name": "공공/공익/정보"}, {"id": "449", "name": "종교"}, {"id": "491", "name": "오픈채널"}, {"id": "507", "name": "유료"}, {"id": "508", "name": "오디오"}, ] PTN_RATING = re.compile(r"([\d,]+)") class KT(EPGProvider): """EPGProvider for KT 데이터: rawhtml 요청수: #channels * #days 특이사항: - 가끔 업데이트 지연 - 프로그램 시작 시각만 제공 """ referer = "https://tv.kt.com/" title_regex = r"^(?P.*?)\s?([\<\(]?(?P<part>\d+)부[\>\)]?)?$" def get_svc_channels(self) -> List[dict]: svc_channels = [] url = "https://tv.kt.com/tv/channel/pChList.asp" params = {"ch_type": "1", "parent_menu_id": "0"} for c in CH_CATE: params.update({"parent_menu_id": c["id"]}) soup = BeautifulSoup(self.request(url, method="POST", data=params)) raw_channels = [unquote(x.find("span", {"class": "ch"}).text.strip()) for x in soup.select("li > a")] # 몇몇 채널은 (TV로만 제공, 유료채널) 웹에서 막혀있지만 실제로는 데이터가 있을 수 있다. for x in raw_channels: svc_channels.append( { "Name": " ".join(x.split()[1:]), "No": str(x.split()[0]), "ServiceId": x.split()[0], "Category": c["name"], } ) return svc_channels @no_endtime def get_programs(self) -> None: url = "https://tv.kt.com/tv/channel/pSchedule.asp" params = { "ch_type": "1", # 1: live 2: skylife 3: uhd live 4: uhd skylife "view_type": "1", # 1: daily 2: weekly "service_ch_no": "SVCID", "seldate": "EPGDATE", } for idx, _ch in enumerate(self.req_channels): log.info("%03d/%03d %s", idx + 1, len(self.req_channels), _ch) for nd in range(int(self.cfg["FETCH_LIMIT"])): day = date.today() + timedelta(days=nd) params.update({"service_ch_no": _ch.svcid, "seldate": day.strftime("%Y%m%d")}) data = self.request(url, method="POST", data=params) try: _epgs = self.__epgs_of_day(_ch.id, data, day) except Exception: log.exception("프로그램 파싱 중 예외: %s, %s", _ch, day) else: _ch.programs.extend(_epgs) def __epgs_of_day(self, channelid: str, data: str, day: datetime) -> List[EPGProgram]: _epgs = [] soup = BeautifulSoup(unquote(data), parse_only=SoupStrainer("tbody")) for row in soup.find_all("tr"): cell = row.find_all("td") hour = cell[0].text.strip() for minute, program, category in zip(*[c.find_all("p") for c in cell[1:]]): _epg = EPGProgram(channelid) _epg.stime = datetime.strptime(f"{day} {hour}:{minute.text.strip()}", "%Y-%m-%d %H:%M") _epg.title = program.text.replace("방송중 ", "").strip() if m := self.title_regex.match(_epg.title): _epg.title = m.group("title") if part_num := m.group("part"): _epg.part_num = part_num _epg.title += f" ({_epg.part_num}부)" _epg.categories = [category.text.strip()] for image in program.find_all("img", alt=True): if "시청 가능" not in (alt := image["alt"]): continue grade = PTN_RATING.match(alt) _epg.rating = int(grade.group(1)) if grade else 0 _epgs.append(_epg) return _epgs