# html_pas.py
import html
import re
  3. def html_pas(data):
  4. data = re.sub('%H%', '<', data)
  5. data = re.sub('%\/H%', '>', data)
  6. d_list = re.findall('<(\/)?([^> ]+)( (?:[^>]+)?)?>', data)
  7. for i_list in d_list:
  8. if(i_list[0] == ''):
  9. if(i_list[1] in ['div', 'span', 'embed', 'iframe']):
  10. if(re.search('<\/' + i_list[1] + '>', data)):
  11. src = re.search('src=([^ ]*)', i_list[2])
  12. if(src):
  13. v_src = re.search('http(?:s)?:\/\/([^/\'" ]*)', src.groups()[0])
  14. if(v_src):
  15. if(not v_src.groups()[0] in ["www.youtube.com", "serviceapi.nmv.naver.com", "tv.kakao.com", "www.google.com", "serviceapi.rmcnmv.naver.com"]):
  16. ot = re.sub('src=([^ ]*)', '', i_list[2])
  17. else:
  18. ot = i_list[2]
  19. else:
  20. ot = re.sub('src=([^ ]*)', '', i_list[2])
  21. else:
  22. ot = i_list[2]
  23. po = re.compile('position', re.I)
  24. data = data.replace('<' + i_list[1] + i_list[2] + '>', '%H%' + i_list[1] + po.sub('', ot) + '%/H%', 1)
  25. data = re.sub('<\/' + i_list[1] + '>', '%H%/' + i_list[1] + '%/H%', data, 1)
  26. data = html.escape(data)
  27. data = data.replace('\\', '&#92;')
  28. end = re.findall('%H%((?:(?!%/H%).)*)%/H%', data)
  29. for d_end in end:
  30. data = re.sub('%H%((?:(?!%/H%).)*)%/H%', '<' + re.sub('&quot;', '"', re.sub('&#x27;', "'", d_end)) + '>', data, 1)
  31. return(data)