找个手机归属地数据库:https://github.com/ls0f/phone
首先将 dat 数据文件导出为 txt 格式:
def export(self):
    """Yield one formatted entry for every phone record in ``self.buf``.

    Records are read in index order. Each one is ``self.phone_fmt_length``
    bytes long, starting at ``self.first_phone_record_offset``, and is
    decoded with ``self.phone_fmt`` into a phone prefix, an offset to the
    record content and a phone type.

    Yields:
        Whatever ``Phone._format_phone_content`` returns for each record.
    """
    record_size = self.phone_fmt_length
    total = int(self.phone_record_count)
    for index in range(total):
        # int() is kept because the stored offsets may not be plain ints.
        begin = int(self.first_phone_record_offset + index * record_size)
        chunk = self.buf[begin: begin + record_size]
        cur_phone, record_offset, phone_type = struct.unpack(
            self.phone_fmt, chunk
        )
        # The index entry only points at the content; resolve it here.
        content = get_record_content(self.buf, record_offset)
        yield Phone._format_phone_content(cur_phone, content, phone_type)
根据步骤1(上面导出的 txt 数据),列出所有号码的前三位:
130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 145, 147, 149, 150, 151, 152, 153, 155, 156, 157, 158, 159, 170, 171, 172, 173, 175, 176, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189
根据步骤2,构建一棵树
# Every three-digit prefix listed in the previous step.
threePrefix = [
    130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
    145, 147, 149,
    150, 151, 152, 153, 155, 156, 157, 158, 159,
    170, 171, 172, 173, 175, 176, 177, 178,
    180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
]

# Map each second digit to the third digits that occur with it.
# The first digit (prefix // 100) is always 1, so it is not stored.
tree = {}
for prefix in threePrefix:
    second = prefix // 10 % 10
    third = prefix % 10
    thirds = tree.setdefault(second, [])
    if third not in thirds:
        thirds.append(third)

# Sort each bucket once at the end instead of after every insertion.
for thirds in tree.values():
    thirds.sort()
{3: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 4: [5, 7, 9], 5: [0, 1, 2, 3, 5, 6, 7, 8, 9], 7: [0, 1, 2, 3, 5, 6, 7, 8], 8: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
根据步骤3合并区间
def _formatRun(start, end):
    """Render one run of consecutive digits as a character-class fragment."""
    if start == end:
        return str(start)
    if (start, end) == (0, 9):
        # The full digit range has a dedicated shorthand.
        return r'\d'
    return str(start) + '-' + str(end)


def calcRegexRanges(nums):
    """Build the shortest regex fragment matching any single digit in *nums*.

    Consecutive digits are merged into ranges, e.g. ``[0, 1, 2, 3, 5]``
    becomes ``[0-35]``. A lone digit or the full 0-9 range is returned
    without brackets (``5`` / ``\\d``).

    Args:
        nums: Non-empty iterable of single digits (0-9). Order and
            duplicates do not matter; the input is normalised first.

    Returns:
        The regex fragment as a string.

    Raises:
        IndexError: If *nums* is empty.
    """
    digits = sorted(set(nums))
    start = end = digits[0]
    runs = []
    for digit in digits[1:]:
        if digit > end + 1:
            # A gap closes the current run and opens a new one.
            runs.append(_formatRun(start, end))
            start = digit
        end = digit
    runs.append(_formatRun(start, end))

    # A single literal or shorthand needs no surrounding brackets.
    if len(runs) == 1 and '-' not in runs[0]:
        return runs[0]
    return '[' + ''.join(runs) + ']'


# Group second digits that share exactly the same list of third digits, so
# they can be emitted as one alternative. The key is the list's string form.
groupped = {}
for second, thirds in sorted(tree.items()):
    group = groupped.setdefault(str(thirds), {})
    group.setdefault('thirds', thirds)
    group.setdefault('seconds', []).append(second)

# One "<second digits><third digits>" alternative per group, ordered by key.
segs = []
for _, group in sorted(groupped.items()):
    secondsPattern = calcRegexRanges(group.get('seconds'))
    thirdsPattern = calcRegexRanges(group.get('thirds'))
    segs.append(secondsPattern + thirdsPattern)

# Leading 1, the grouped second/third digit alternatives, then 8 more digits.
regex = '1(' + '|'.join(segs) + ')\\d{8}'
print(regex)
结果
1([38]\d|5[0-35-9]|7[0-35-8]|4[579])\d{8}
联系客服