PYTHON
Dưới đây là một trình tạo regex tự khớp. Bạn cung cấp hai danh sách, một danh sách chứa dữ liệu đào tạo mà regex phải khớp (ngoài việc khớp chính nó), danh sách còn lại chứa dữ liệu đào tạo mà regex KHÔNG nên khớp:
from random import choice, randrange
import re
from itertools import zip_longest, chain, islice
from operator import itemgetter
CHAR_SET = [chr(i) for i in range(128)] + [r"\\", r"\d", r"\D",
r"\w", r"\W", r"\s",
r"\S", r"?:", r"\1",
r"\2", r"\A", r"\b",
r"\B", r"\Z", r"\.",
r"\[", r"\]", r"\(",
r"\)", r"\{", r"\}",
r"\+", r"\|", r"\?",
r"\*"]
CHAR_SAMPLE = []
BREAKPOINT = re.compile(
r"""
\(.*?\)|
\[.*?\]|
\{.*?\}|
\w+(?=[\(\[\{])?|
\S+?|
\.\*\??|
\.\+\??|
\.\?\??|
\\.|
.*?
""",
re.VERBOSE)
MATCH_BRACKETS = {'(': ')', '[': ']', '{': '}'}
CLOSE_BRACKETS = {')', ']', '}'}
REGEX_SEEDER = [
r".*?",
r"(?:.*?)",
r"\w|\s",
r"(?<.*?)",
r"(?=.*?)",
r"(?!.*?)",
r"(?<=.*?)",
r"(?<!.*?)",
]
LEN_LIMIT = 100
def distribute(distribution):
global CHAR_SAMPLE
for item in CHAR_SET:
if item in distribution:
CHAR_SAMPLE.extend([item] * distribution[item])
else:
CHAR_SAMPLE.append(item)
def rand_index(seq, stop=None):
if stop is None:
stop = len(seq)
try:
return randrange(0, stop)
except ValueError:
return 0
def rand_slice(seq):
try:
start = randrange(0, len(seq))
stop = randrange(start, len(seq))
return slice(start, stop)
except ValueError:
return slice(0, 0)
#Mutation Functions
def replace(seq):
seq[rand_index(seq)] = choice(CHAR_SAMPLE)
def delete(seq):
del seq[rand_index(seq)]
def insert(seq):
seq.insert(rand_index(seq, len(seq) + 1), choice(CHAR_SAMPLE))
def duplicate(seq):
source = rand_slice(seq)
seq[source.stop: source.stop] = seq[source]
def swap(seq):
if len(seq) < 2: return
a = rand_index(seq, len(seq) - 1)
seq[a], seq[a + 1] = seq[a + 1], seq[a]
dummy = lambda seq: None
MUTATE = (
replace,
delete,
insert,
duplicate,
swap,
dummy,
dummy,
)
def repair_brackets(seq):
"""Attempts to lower the percentage of invalid regexes by
matching orphaned brackets"""
p_stack, new_seq = [], []
for item in seq:
if item in MATCH_BRACKETS:
p_stack.append(item)
elif item in CLOSE_BRACKETS:
while p_stack and MATCH_BRACKETS[p_stack[-1]] != item:
new_seq.append(MATCH_BRACKETS[p_stack[-1]])
p_stack.pop()
if not p_stack:
continue
else:
p_stack.pop()
new_seq.append(item)
while p_stack:
new_seq.append(MATCH_BRACKETS[p_stack.pop()])
return new_seq
def compress(seq):
new_seq = [seq[0]]
last_match = seq[0]
repeat = 1
for item in islice(seq, 1, len(seq)):
if item == last_match:
repeat += 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
new_seq.append(item)
last_match = item
repeat = 1
else:
if repeat > 1:
new_seq.extend(list("{{{0}}}".format(repeat)))
return new_seq
def mutate(seq):
"""Random in-place mutation of sequence"""
if len(seq) > LEN_LIMIT:
seq[:] = seq[:LEN_LIMIT]
c = choice(MUTATE)
c(seq)
def crossover(seqA, seqB):
"""Recombination of two sequences at optimal breakpoints
along each regex strand"""
bpA = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
bpB = [item.start() for item in BREAKPOINT.finditer(''.join(seqA))]
slObjA = (slice(*item) for item in zip(bpA, bpA[1:]))
slObjB = (slice(*item) for item in zip(bpB, bpB[1:]))
slices = zip_longest(
(seqA[item] for item in slObjA),
(seqB[item] for item in slObjB),
fillvalue=[]
)
recombinant = (choice(item) for item in slices)
return list(chain.from_iterable(recombinant))
#Fitness testing
def match_percentage(match):
"""Calculates the percentage a text actually matched
by a regular expression"""
if match and match.endpos:
return (match.end() - match.start()) / match.endpos
else:
return 0.001
def fitness_test(seq, pos_matches, neg_matches):
"""Scoring algorithm to determine regex fitness"""
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
seq[:] = repair_brackets(seq)
try:
self_str = ''.join(seq)
regex = re.compile(self_str)
except (re.error, IndexError):
return 0.001
pos_score = sum(match_percentage(regex.search(item))
for item in pos_matches) / len(pos_matches) / 3
neg_score = (1 - sum(match_percentage(regex.search(item))
for item in neg_matches) / len(neg_matches)) / 3
self_score = match_percentage(regex.search(self_str)) / 3
return pos_score + self_score + neg_score
#Population Management
def generate_pop(pos_matches, neg_matches, pop_size):
sources = (pos_matches, REGEX_SEEDER)
return [crossover(
choice(choice(sources)), choice(choice(sources))
) for i in range(pop_size)]
def glean_pop(population, cutoff, fit_test, ft_args=()):
scores = (fit_test(bug, *ft_args) for bug in population)
ranked = sorted(zip(population, scores), key=itemgetter(1), reverse=True)
maxItem = ranked[0]
new_pop = next(zip(*ranked))[:cutoff]
return maxItem, new_pop
def repopulate(population, pop_size):
cutoff = len(population)
for i in range(pop_size // cutoff):
population.extend([crossover(choice(population), choice(population))
for i in range(cutoff)])
population.extend([population[i][:] for i in range(pop_size - len(population))])
#Simulator
def simulate(pos_matches, neg_matches, pop_size=50, cutoff=10, threshold=1.0):
population = generate_pop(pos_matches, neg_matches, pop_size)
while True:
for bug in population:
mutate(bug)
#Scoring step
max_item, population = glean_pop(
population,
cutoff,
fitness_test,
(pos_matches, neg_matches)
)
#Exit condition:
max_regex, max_score = max_item
if max_score >= threshold:
return max_score, max_regex
"""
print(max_score, ''.join(max_regex))
input("next?")"""
#Repopulation Step:
population = list(population)
repopulate(population, pop_size)