"""Locate leaked personal-data records inside the internal source data.

Overall approach:
  1. Parse the internal ("source") data files into one unified structure.
  2. Parse each leaked file, extract every recognisable personal-data value
     (phone, ID number, address, email, bank card, name) and look it up in
     the source structure, recording where it came from.
"""
import os
import re
import json

try:
    from tqdm import tqdm
except ImportError:
    # tqdm only draws progress bars; degrade gracefully when missing.
    def tqdm(iterable, **kwargs):
        return iterable

# data maps a source filename to its parsed table:
# {
#     filename: {
#         'data': [...],           # rows (lists of cell values)
#         'meta': {
#             'header': [...],     # column names
#             'filename': '',
#             'length': 0,         # number of rows
#         },
#         'p_phone': {...},        # per-column value -> row-index lookup maps
#     }
# }
data = {}

# Column names (compared lower-cased) that hold personal data worth indexing.
varibs = ['phone', 'cert_no', 'address', 'email', 'bank_card', 'name']

# Output lines: "<leak file>,<source file>,<field type>,<matched value>\n".
result = []

# Known-valid 3-digit phone-number prefixes (membership-only: use a set).
valid_phone_prefix = {
    334, 335, 336, 337, 338, 339, 347, 350, 351, 352, 357, 358, 359, 378,
    382, 383, 384, 387, 388, 330, 331, 332, 345, 355, 356, 385, 386, 376,
    375, 333, 349, 353, 380, 381, 389, 377, 399, 391, 393, 395, 398,
}

# Known-valid 3-digit bank-card prefixes (membership-only: use a set).
valid_bank_card_prefix = {
    922, 883, 165, 221, 880, 584, 768, 908, 197, 130, 956, 500, 723, 881,
    886, 839, 922, 335, 706, 205, 132, 123, 127, 766, 110, 803, 322, 844,
    130, 641, 328, 883, 667, 339, 337, 794, 325, 880, 571, 397, 123, 958,
    758, 203, 291, 745, 842, 981, 224, 861, 839, 613, 950, 787, 295, 391,
    702, 612, 967, 618, 704, 974, 136, 592, 862, 917, 781, 278, 662, 895,
    791, 856, 189, 780, 211, 898, 268, 961, 851, 131, 581, 145, 969, 730,
    708, 693, 763, 275, 273, 615, 333, 113, 890, 981, 662, 338, 692, 571,
    776, 662, 724, 847, 106, 903, 978, 941, 859, 929, 233, 632, 897, 158,
    593, 991, 684, 668, 617, 597, 933, 102, 739, 906, 728, 570, 715, 129,
    924, 336, 751, 282, 699, 286, 982, 194, 918, 665, 917, 972, 937, 134,
    962, 790, 173, 186, 118, 115, 873, 874, 388, 213, 917, 740, 290, 398,
    172, 272, 264, 641, 273, 773, 380, 764, 586, 217, 125, 916, 293, 212,
    848, 721, 937, 122, 331, 119, 507, 250, 137, 837, 265, 619, 795, 953,
    326, 746, 917, 562, 329, 765, 132, 505, 792, 923, 776, 747, 161, 856,
    229, 997, 149, 173, 980, 179, 862, 263, 679, 359, 234, 109, 971, 139,
    705, 323, 684, 572, 276, 948, 991, 899, 738, 139, 306, 202, 791, 706,
    299, 207, 782, 500, 617, 612, 118, 727, 993, 389, 119, 261, 306, 979,
    931, 902, 239, 791, 199, 958, 661, 958, 804, 597, 887, 167, 142, 187,
    237, 274, 271, 878, 715, 619, 962, 217, 399, 719, 899, 833, 239, 665,
    889, 207, 141, 175, 987, 908, 251, 949, 229, 218, 730, 702, 901, 179,
    330, 680, 184, 678, 218, 569, 384, 211, 906, 147, 948, 113, 729, 787,
    115, 194, 218, 946, 141, 293, 701, 266, 219, 562, 272, 798, 263, 862,
    206, 926, 938, 203, 768, 141, 128, 111, 906, 844, 711, 509, 165, 166,
}


def is_valid_cert_no(n):
    """Return True if *n* is an 18-char Chinese resident ID number whose
    ISO 7064 MOD 11-2 check digit (last char, case-insensitive) is valid."""
    weights = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
    check_digits = ['1', '0', 'x', '9', '8', '7', '6', '5', '4', '3', '2']
    total = 0
    for i in range(0, 17):
        total += int(n[i]) * weights[i]
    return check_digits[total % 11] == n[17].lower()


def format_var(val, format):
    """Normalise *val* for use as a lookup key.

    Phone numbers are reduced to their 11 digits (optional +86 prefix and
    separators stripped); every other field type is returned unchanged.
    """
    if format == 'phone':
        res = re.findall(r'\(?\+?8?6?\)? ?(\d{3}).?(\d{4}).?(\d{4})\(?8?6?\)?', val)
        if not res:
            # Fix: original code raised IndexError on values that do not
            # look like a phone number; fall back to the raw value instead.
            return val
        return ''.join(res[0])
    else:
        return val


def _index_columns(filename, skip_value=None):
    """Build the per-column value -> row-index maps (``p_<col>``) for
    ``data[filename]``.

    When *skip_value* is None, only columns whose lower-cased name is in
    `varibs` are indexed.  When *skip_value* is given, every column is
    indexed and cells equal to *skip_value* (placeholder cells) are ignored.
    """
    header = data[filename]['meta']['header']
    length = data[filename]['meta']['length']
    rows = data[filename]['data']
    for col, name in enumerate(header):
        key = name.lower()
        if skip_value is None and key not in varibs:
            continue
        index = {}
        for row_no in range(length):
            cell = rows[row_no][col]
            if skip_value is not None and cell == skip_value:
                continue
            # Later rows with the same value overwrite earlier ones,
            # matching the original behaviour.
            index[format_var(cell, key)] = row_no
        data[filename][f'p_{key}'] = index


def work_for_table(path, filename):
    """Parse one CSV source table into `data` and index its columns.

    The first CSV column is a row id and is dropped from header and rows.
    """
    global data
    data[filename] = {'data': None, 'meta': {'filename': filename}}
    with open(path, encoding='utf-8') as f:
        header = f.readline().strip().split(',')[1:]
        rows = [line.strip().split(',')[1:] for line in f.readlines()]
    data[filename]['meta']['header'] = header
    data[filename]['data'] = rows
    data[filename]['meta']['length'] = len(rows)
    _index_columns(filename)


def locate_file(val, type):
    """Look up *val* (a field of kind *type*) in the source data.

    Returns (source filename, type, stored value) for the first source file
    containing it, or None.
    NOTE(review): the header test is case-sensitive — a source header like
    'Phone' would never match the lower-cased *type*; confirm source headers
    are always lower-case.
    """
    global data
    fval = format_var(val, type)
    for key in data:
        header = data[key]['meta']['header']
        if type not in header:
            continue
        idx = data[key][f'p_{type}'].get(fval, None)
        if idx is not None:
            return (key, type, data[key]['data'][idx][header.index(type)])


def leakwork_for_table(path, filename):
    """Scan one leaked CSV table and record every value found in `data`."""
    global result
    with open(path, encoding='utf-8') as f:
        header = f.readline().strip().split(',')[1:]
        rows = [line.strip().split(',') for line in f.readlines()]
    for col, name in enumerate(header):
        if name.lower() not in varibs:
            continue
        for row in rows:
            # +1: data rows keep the leading id column, the header does not.
            res = locate_file(row[col + 1], name.lower())
            if res is not None:
                result.append(','.join((filename,) + res) + '\n')


def parse_content(lines):
    """Classify raw file lines.

    Returns one of:
      (list of dicts, 'json')   - one line of concatenated JSON objects
      (str,           'line')   - one long free-text line (>100 '。')
      (dict,          'table')  - header + rows split on the first separator
                                  (space, comma, tab, '|', ':', ';') that
                                  splits the header line
      (None, None)              - unrecognised content
    """
    if len(lines) == 1:
        if lines[0][0] == '{':
            # Concatenated JSON objects "{...} {...} ..." — split on " {".
            parts = lines[0].strip().split(' {')
            parsed = [json.loads(parts[0])]
            for part in parts[1:]:
                parsed.append(json.loads('{' + part))
            return parsed, 'json'
        if lines[0].count('。') > 100:
            return lines[0], 'line'
        return None, None
    for sep in (' ', ',', '\t', '|', ':', ';'):
        if len(lines[0].split(sep)) > 1:
            header = lines[0].strip().split(sep)
            rows = [lines[i].strip().split(sep) for i in range(1, len(lines))]
            return {'header': header, 'data': rows}, 'table'
    return None, None


def parse_item(val):
    """Classify a single extracted value.

    Returns (value, kind) where kind is one of 'name', 'phone', 'email',
    'address', 'bank_card', 'cert_no', or (None, None) if unrecognised.
    """
    res = re.findall('([\u4e00-\u9fa5·]{2,3})', val)
    # Common Chinese surname characters used to recognise personal names.
    valid_name = "张王徐习李朱岳赵钱孙周吴郑冯陈魏蒋沈韩粱江许黄胡程陆卢鲁刘柳花仓苍常嫦吕林郭闵"
    if res:
        # NOTE(review): substring test of a 2-3 char match against the
        # surname string — possibly meant `res[0][0] in valid_name`; kept
        # as-is to preserve behaviour.
        if res[0] in valid_name:
            return res[0], 'name'
    res = re.findall(r'(\(?\+?8?6?\)? ?(\d{3}).?(\d{4}).?(\d{4})\(?8?6?\)?)', val)
    if res:
        if int(res[0][1]) in valid_phone_prefix:
            return res[0][0], 'phone'
    res = re.findall(r'([a-zA-Z0-9_-]+(?:\.[a-zA-Z0-9_-]+)*@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*\.[a-z]{2,6})', val)
    if res:
        return res[0], 'email'
    for w in ['学校', '大学', '学院', '中学', '小学', '幼儿园', '培训中心', '分部', '分校', '房', '号', '院', '村', '区', '县', '市', '省']:
        if val.endswith(w):
            return val, 'address'
    # Work on the first run of non-Chinese characters (digits, letters, ...).
    non_cn = re.findall('[^\u4e00-\u9fa5]+', val)
    if not non_cn:
        return None, None
    val = non_cn[0]
    s = ''
    for ch in val:
        if ord('0') <= ord(ch) <= ord('9'):
            s += ch
        elif len(s) == 17 and ch in ('x', 'X'):
            # Fix: the original `len(s) == 17 and ch == 'x' or ch == 'X'`
            # appended ANY uppercase 'X' regardless of position, so the
            # all-digits check below wrongly rejected bank-card candidates.
            s += ch
    for c in s[:-1]:
        if c not in "1234567890":
            return None, None
    if 13 <= len(s) <= 17:
        if int(s[:3]) in valid_bank_card_prefix:
            return val, 'bank_card'
    elif len(s) == 18:
        # NOTE(review): positions 10-11 of an ID are the birth month; the
        # 13..24 range looks dataset-specific — confirm against the data.
        if 13 <= int(s[10:12]) <= 24:
            return val, 'cert_no'
    return None, None


def work_for_txt(path, filename):
    """Parse one source txt file (table / json / free text) into `data`
    and build its lookup indexes."""
    data[filename] = {'data': None, 'meta': {'filename': filename}}
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    content, kind = parse_content(lines)
    if kind == 'table':
        data[filename]['meta']['header'] = content['header'].copy()
        data[filename]['meta']['length'] = len(content['data'])
        data[filename]['data'] = content['data'].copy()
        _index_columns(filename)
    elif kind == 'json':
        header = list(content[0].keys())
        data[filename]['meta']['header'] = header
        data[filename]['meta']['length'] = len(content)
        data[filename]['data'] = [[obj[k] for k in header] for obj in content]
        _index_columns(filename)
    elif kind == 'line':
        header = ['phone', 'cert_no', 'address', 'email', 'bank_card']
        rows = []
        for item in content.split(','):
            row = ['xenny'] * 5  # 'xenny' marks an empty cell
            v, vtype = parse_item(item)
            if v is not None:
                row[header.index(vtype)] = v
            rows.append(row)
        data[filename]['meta']['header'] = header.copy()
        data[filename]['meta']['length'] = len(rows)
        data[filename]['data'] = rows
        _index_columns(filename, skip_value='xenny')


def leakwork_for_txt(path, filename):
    """Scan one leaked txt file (table / json / free text) for matches."""
    global result
    with open(path, encoding='utf-8') as f:
        lines = f.readlines()
    content, kind = parse_content(lines)
    if kind == 'table':
        for name in content['header']:
            if name.lower() not in varibs:
                continue
            # NOTE(review): case-sensitive index of the lower-cased name —
            # raises ValueError if the leaked header is not lower-case.
            col = content['header'].index(name.lower())
            for row in content['data']:
                res = locate_file(row[col], name.lower())
                if res is not None:
                    result.append(','.join((filename,) + res) + '\n')
    elif kind == 'json':
        for obj in content:
            for k in obj:
                if k.lower() in varibs:
                    res = locate_file(obj[k], k.lower())
                    if res is not None:
                        result.append(','.join((filename,) + res) + '\n')
    elif kind == 'line':
        for item in content.split(','):
            v, vtype = parse_item(item)
            if v is not None:
                res = locate_file(v, vtype)
                if res is not None:
                    result.append(','.join((filename,) + res) + '\n')


def leakwork_for_pic2(path, filename):
    """Scan one OCR-result file, treating its whole content as a single
    comma-separated blob of values."""
    global result
    with open(path, encoding='utf-8') as f:
        content = "".join(f.readlines())
    for item in content.split(','):
        v, vtype = parse_item(item)
        if v is not None:
            res = locate_file(v, vtype)
            if res is not None:
                result.append(','.join((filename,) + res) + '\n')


def work():
    """Load all internal source data into `data`."""
    path1 = './内部数据/table_files'
    for filename in os.listdir(path1):
        work_for_table(os.path.join(path1, filename), filename)
    path2 = './内部数据/txt_files'
    for filename in os.listdir(path2):
        work_for_txt(os.path.join(path2, filename), filename)


def leak_work():
    """Scan all leaked data and collect matched records into `result`."""
    path1 = './泄漏数据/table_files'
    for filename in tqdm(os.listdir(path1)):
        leakwork_for_table(os.path.join(path1, filename), filename)
    path2 = './泄漏数据/txt_files'
    for filename in tqdm(os.listdir(path2)):
        leakwork_for_txt(os.path.join(path2, filename), filename)
    path3 = './result'
    for filename in tqdm(os.listdir(path3)):
        leakwork_for_pic2(os.path.join(path3, filename), filename)


if __name__ == '__main__':
    # Guarded so importing this module does not trigger the full pipeline.
    work()
    leak_work()
    with open('result.txt', 'w+', encoding='utf-8') as f:
        f.writelines(result)