from . import tool import datetime import html import re def html_only(conn, data, title, main_num): curs = conn.cursor() backlink = [] plus_data = '' while 1: in_data = re.search('((?:(?!<\/a>).)+)<\/a>', data) if in_data: in_data = in_data.groups() if in_data[0]: main_link = in_data[0] sub_link = in_data[1] else: main_link = in_data[1] sub_link = in_data[1] curs.execute("select title from data where title = ?", [main_link]) if not curs.fetchall(): link_id = 'id="not_thing"' backlink += [[title, main_link, 'no']] else: link_id = 'id=""' backlink += [[title, main_link, '']] data = re.sub('((?:(?!<\/a>).)+)<\/a>', '' + sub_link + '', data, 1) else: break data = re.sub(').)+)>((?:(?!<\/a>).)+)<\/a>', data) if in_data: in_data = in_data.groups() a_data = re.sub('href="((?:(?!").)+)"', '', in_data[0]) a_data = re.sub('id="((?:(?!").)+)"', '', a_data) if re.search('=', a_data): data = re.sub(').)+)>((?:(?!<\/a>).)+)<\/a>', '', data, 1) else: data = re.sub(').)+)>((?:(?!<\/a>).)+)<\/a>', '' + in_data[1] + '', data, 1) else: break data = re.sub('