فهرست منبع

html_only 파서 기본편

Surplus_Up (2DU) 7 سال پیش
والد
کامیت
6c704358a4
5 فایلهای تغییر یافته به همراه 106 افزوده شده و 35 حذف شده
  1. 4 1
      func.py
  2. 8 2
      mark.py
  3. 59 1
      set_mark/html_only.py
  4. 1 30
      set_mark/namu.py
  5. 34 1
      set_mark/tool.py

+ 4 - 1
func.py

@@ -58,7 +58,10 @@ def render_set(title = '', data = '', num = 0):
     if acl_check(title, 'render') == 1:
         return 'http request 401.3'
     else:
-        return namumark(title, data, num)
+        curs.execute('select data from other where name = "markup"')
+        markup = curs.fetchall()
+
+        return namumark(title, data, num, markup[0][0])
 
 def captcha_get():
     data = ''

+ 8 - 2
mark.py

@@ -1,4 +1,5 @@
 from set_mark.namu import namu
+from set_mark.html_only import html_only
 
 import re
 import html
@@ -39,9 +40,14 @@ def plusing(data):
         if not curs.fetchall():
             curs.execute("insert into back (title, link, type) values (?, ?, ?)", [data_in[1], data_in[0], data_in[2]])
 
-def namumark(title = '', data = '', num = 0):
+def namumark(title = '', data = '', num = 0, markup = 'namumark'):
     if not data == '':
-        data = namu(conn, data, title, num)
+        if markup == 'namumark':
+            data = namu(conn, data, title, num)
+        elif markup == 'html':
+            data = html_only(conn, data, title, num)
+        else:
+            data = ['', '', []]
 
         if num == 1:
             data_num = len(data[2]) 

+ 59 - 1
set_mark/html_only.py

@@ -9,5 +9,63 @@ def html_only(conn, data, title, main_num):
 	
 	backlink = []
 	plus_data = ''
+
+	while 1:
+		in_data = re.search('<a(?: href="/w/((?:(?!\").)+)")?>((?:(?!<\/a>).)+)<\/a>', data)
+		if in_data:
+			in_data = in_data.groups()
+
+			if in_data[0]:
+				main_link = in_data[0]
+				sub_link = in_data[1]
+			else:
+				main_link = in_data[1]
+				sub_link = in_data[1]
+
+			curs.execute("select title from data where title = ?", [main_link])
+			if not curs.fetchall():
+				link_id = 'id="not_thing"'
+			
+				backlink += [[title, main_link, 'no']]
+			else:
+				link_id = 'id=""'
+
+			backlink += [[title, main_link, '']]
+
+			data = re.sub('<a(?: href="/((?:(?!\").)+)")?>((?:(?!<\/a>).)+)<\/a>', '<a ' + link_id + ' href="/w/' + main_link + '">' + sub_link + '</a>', data, 1)
+		else:
+			break
+
+	data = re.sub('<test_a', '', data)
+
+	while 1:
+		in_data = re.search('<a((?:(?!>).)+)>((?:(?!<\/a>).)+)<\/a>', data)
+		if in_data:
+			in_data = in_data.groups()
+
+			a_data = re.sub('href="((?:(?!").)+)"', '', in_data[0])
+			a_data = re.sub('id="((?:(?!").)+)"', '', a_data)
+
+			if re.search('=', a_data):
+				data = re.sub('<a((?:(?!>).)+)>((?:(?!<\/a>).)+)<\/a>', '', data, 1)
+			else:
+				data = re.sub('<a((?:(?!>).)+)>((?:(?!<\/a>).)+)<\/a>', '<test_a ' + in_data[0] + '>' + in_data[1] + '</a>', data, 1)
+		else:
+			break
+
+	data = re.sub('<test_a', '<a', data)
+
+	ok_list = [
+		'h1',
+		'h2',
+		'h3',
+		'h4',
+		'h5',
+		'h6',
+		'li',
+		'a'
+	]
+
+	data = tool.xss_protect(curs, data, ok_list)
 	
-	return data
+	return [data, plus_data, backlink]

+ 1 - 30
set_mark/namu.py

@@ -458,36 +458,7 @@ def namu(conn, data, title, main_num):
 
     data = re.sub('&amp;', '&', data)
 
-    curs.execute('select html from html_filter where kind = ""')
-    html_db = curs.fetchall()
-
-    src_list = ["www.youtube.com", "serviceapi.nmv.naver.com", "tv.kakao.com", "www.google.com", "serviceapi.rmcnmv.naver.com"]
-    html_list = ['div', 'span', 'embed', 'iframe', 'ruby', 'rp', 'rt']
-    
-    html_data = re.findall('&lt;(\/)?((?:(?!&gt;| ).)+)( (?:(?:(?!&gt;).)+)?)?&gt;', data)
-    for in_data in html_data:
-        if in_data[0] == '':
-            if in_data[1] in html_list or (html_db and in_data[1] in html_db[0]):
-                if re.search('&lt;\/' + in_data[1] + '&gt;', data):
-                    src = re.search('src=([^ ]*)', in_data[2])
-                    if src:
-                        v_src = re.search('http(?:s)?:\/\/([^/\'" ]*)', src.groups()[0])
-                        if v_src:
-                            if not v_src.groups()[0] in src_list:
-                                and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', re.sub('src=([^ ]*)', '', in_data[2])))
-                            else:
-                                and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', in_data[2]))
-                        else:
-                            and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', re.sub('src=([^ ]*)', '', in_data[2])))
-                    else:
-                        and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', in_data[2]))
-                        
-
-                    data = data.replace('&lt;' + in_data[1] + in_data[2] + '&gt;', '<' + in_data[1] + and_data + '>', 1)
-                    data = re.sub('&lt;\/' + in_data[1] + '&gt;', '</' + in_data[1] + '>', data, 1)
-
-    position = re.compile('position', re.I)
-    data = position.sub('', data)
+    data = tool.xss_protect(curs, data)
 
     data = re.sub('\n( +)\|\|', '\n||', data)
     data = re.sub('\|\|( +)\n', '||\n', data)

+ 34 - 1
set_mark/tool.py

@@ -41,4 +41,37 @@ def sha224(data):
     return hashlib.sha224(bytes(data, 'utf-8')).hexdigest()
 
 def md5_replace(data):
-    return hashlib.md5(data.encode()).hexdigest()
+    return hashlib.md5(data.encode()).hexdigest()
+
+def xss_protect(curs, data, ok_list = []):
+    curs.execute('select html from html_filter where kind = ""')
+    html_db = curs.fetchall()
+
+    src_list = ["www.youtube.com", "serviceapi.nmv.naver.com", "tv.kakao.com", "www.google.com", "serviceapi.rmcnmv.naver.com"]
+    html_list = ['div', 'span', 'embed', 'iframe', 'ruby', 'rp', 'rt'] + ok_list
+    
+    html_data = re.findall('&lt;(\/)?((?:(?!&gt;| ).)+)( (?:(?:(?!&gt;).)+)?)?&gt;', data)
+    for in_data in html_data:
+        if in_data[0] == '':
+            if in_data[1] in html_list or (html_db and in_data[1] in html_db[0]):
+                if re.search('&lt;\/' + in_data[1] + '&gt;', data):
+                    src = re.search('src=([^ ]*)', in_data[2])
+                    if src:
+                        v_src = re.search('http(?:s)?:\/\/([^/\'" ]*)', src.groups()[0])
+                        if v_src:
+                            if not v_src.groups()[0] in src_list:
+                                and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', re.sub('src=([^ ]*)', '', in_data[2])))
+                            else:
+                                and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', in_data[2]))
+                        else:
+                            and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', re.sub('src=([^ ]*)', '', in_data[2])))
+                    else:
+                        and_data = re.sub('&#x27;', '\'', re.sub('&quot;', '"', in_data[2]))
+                        
+                    data = data.replace('&lt;' + in_data[1] + in_data[2] + '&gt;', '<' + in_data[1] + and_data + '>', 1)
+                    data = re.sub('&lt;\/' + in_data[1] + '&gt;', '</' + in_data[1] + '>', data, 1)
+
+    position = re.compile('position', re.I)
+    data = position.sub('', data)
+
+    return data