Tôi tin rằng một cách tốt hơn để làm điều đó là sử dụng một cỗ máy trạng thái. Dưới đây là mã mẫu mà tôi đã tìm ra bằng cách chuyển đổi mã NodeJS trên liên kết bên dưới sang Python 3 (từ khóa phi địa phương được sử dụng chỉ có sẵn trong Python 3, mã sẽ không hoạt động trên Python 2)
Chỉnh sửa-1: Đã cập nhật và tạo mã tương thích với Python 2
Chỉnh sửa-2: Đã cập nhật và thêm phiên bản chỉ Python3
https://gist.github.com/creationix/5992451
Phiên bản chỉ dành cho Python 3
# A streaming byte oriented JSON parser. Feed it a single byte at a time and
# it will emit complete objects as it comes across them. Whitespace within and
# between objects is ignored. This means it can parse newline delimited JSON.
import math
def json_machine(emit, next_func=None):
def _value(byte_data):
if not byte_data:
return
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _value # Ignore whitespace
if byte_data == 0x22: # "
return string_machine(on_value)
if byte_data == 0x2d or (0x30 <= byte_data < 0x40): # - or 0-9
return number_machine(byte_data, on_number)
if byte_data == 0x7b: #:
return object_machine(on_value)
if byte_data == 0x5b: # [
return array_machine(on_value)
if byte_data == 0x74: # t
return constant_machine(TRUE, True, on_value)
if byte_data == 0x66: # f
return constant_machine(FALSE, False, on_value)
if byte_data == 0x6e: # n
return constant_machine(NULL, None, on_value)
if next_func == _value:
raise Exception("Unexpected 0x" + str(byte_data))
return next_func(byte_data)
def on_value(value):
emit(value)
return next_func
def on_number(number, byte):
emit(number)
return _value(byte)
next_func = next_func or _value
return _value
TRUE = [0x72, 0x75, 0x65]
FALSE = [0x61, 0x6c, 0x73, 0x65]
NULL = [0x75, 0x6c, 0x6c]
def constant_machine(bytes_data, value, emit):
i = 0
length = len(bytes_data)
def _constant(byte_data):
nonlocal i
if byte_data != bytes_data[i]:
i += 1
raise Exception("Unexpected 0x" + str(byte_data))
i += 1
if i < length:
return _constant
return emit(value)
return _constant
def string_machine(emit):
string = ""
def _string(byte_data):
nonlocal string
if byte_data == 0x22: # "
return emit(string)
if byte_data == 0x5c: # \
return _escaped_string
if byte_data & 0x80: # UTF-8 handling
return utf8_machine(byte_data, on_char_code)
if byte_data < 0x20: # ASCII control character
raise Exception("Unexpected control character: 0x" + str(byte_data))
string += chr(byte_data)
return _string
def _escaped_string(byte_data):
nonlocal string
if byte_data == 0x22 or byte_data == 0x5c or byte_data == 0x2f: # " \ /
string += chr(byte_data)
return _string
if byte_data == 0x62: # b
string += "\b"
return _string
if byte_data == 0x66: # f
string += "\f"
return _string
if byte_data == 0x6e: # n
string += "\n"
return _string
if byte_data == 0x72: # r
string += "\r"
return _string
if byte_data == 0x74: # t
string += "\t"
return _string
if byte_data == 0x75: # u
return hex_machine(on_char_code)
def on_char_code(char_code):
nonlocal string
string += chr(char_code)
return _string
return _string
# Nestable state machine for UTF-8 Decoding.
def utf8_machine(byte_data, emit):
left = 0
num = 0
def _utf8(byte_data):
nonlocal num, left
if (byte_data & 0xc0) != 0x80:
raise Exception("Invalid byte in UTF-8 character: 0x" + byte_data.toString(16))
left = left - 1
num |= (byte_data & 0x3f) << (left * 6)
if left:
return _utf8
return emit(num)
if 0xc0 <= byte_data < 0xe0: # 2-byte UTF-8 Character
left = 1
num = (byte_data & 0x1f) << 6
return _utf8
if 0xe0 <= byte_data < 0xf0: # 3-byte UTF-8 Character
left = 2
num = (byte_data & 0xf) << 12
return _utf8
if 0xf0 <= byte_data < 0xf8: # 4-byte UTF-8 Character
left = 3
num = (byte_data & 0x07) << 18
return _utf8
raise Exception("Invalid byte in UTF-8 string: 0x" + str(byte_data))
# Nestable state machine for hex escaped characters
def hex_machine(emit):
left = 4
num = 0
def _hex(byte_data):
nonlocal num, left
if 0x30 <= byte_data < 0x40:
i = byte_data - 0x30
elif 0x61 <= byte_data <= 0x66:
i = byte_data - 0x57
elif 0x41 <= byte_data <= 0x46:
i = byte_data - 0x37
else:
raise Exception("Expected hex char in string hex escape")
left -= 1
num |= i << (left * 4)
if left:
return _hex
return emit(num)
return _hex
def number_machine(byte_data, emit):
sign = 1
number = 0
decimal = 0
esign = 1
exponent = 0
def _mid(byte_data):
if byte_data == 0x2e: # .
return _decimal
return _later(byte_data)
def _number(byte_data):
nonlocal number
if 0x30 <= byte_data < 0x40:
number = number * 10 + (byte_data - 0x30)
return _number
return _mid(byte_data)
def _start(byte_data):
if byte_data == 0x30:
return _mid
if 0x30 < byte_data < 0x40:
return _number(byte_data)
raise Exception("Invalid number: 0x" + str(byte_data))
if byte_data == 0x2d: # -
sign = -1
return _start
def _decimal(byte_data):
nonlocal decimal
if 0x30 <= byte_data < 0x40:
decimal = (decimal + byte_data - 0x30) / 10
return _decimal
return _later(byte_data)
def _later(byte_data):
if byte_data == 0x45 or byte_data == 0x65: # E e
return _esign
return _done(byte_data)
def _esign(byte_data):
nonlocal esign
if byte_data == 0x2b: # +
return _exponent
if byte_data == 0x2d: # -
esign = -1
return _exponent
return _exponent(byte_data)
def _exponent(byte_data):
nonlocal exponent
if 0x30 <= byte_data < 0x40:
exponent = exponent * 10 + (byte_data - 0x30)
return _exponent
return _done(byte_data)
def _done(byte_data):
value = sign * (number + decimal)
if exponent:
value *= math.pow(10, esign * exponent)
return emit(value, byte_data)
return _start(byte_data)
def array_machine(emit):
array_data = []
def _array(byte_data):
if byte_data == 0x5d: # ]
return emit(array_data)
return json_machine(on_value, _comma)(byte_data)
def on_value(value):
array_data.append(value)
def _comma(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _comma # Ignore whitespace
if byte_data == 0x2c: # ,
return json_machine(on_value, _comma)
if byte_data == 0x5d: # ]
return emit(array_data)
raise Exception("Unexpected byte: 0x" + str(byte_data) + " in array body")
return _array
def object_machine(emit):
object_data = {}
key = None
def _object(byte_data):
if byte_data == 0x7d: #
return emit(object_data)
return _key(byte_data)
def _key(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _object # Ignore whitespace
if byte_data == 0x22:
return string_machine(on_key)
raise Exception("Unexpected byte: 0x" + str(byte_data))
def on_key(result):
nonlocal key
key = result
return _colon
def _colon(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _colon # Ignore whitespace
if byte_data == 0x3a: # :
return json_machine(on_value, _comma)
raise Exception("Unexpected byte: 0x" + str(byte_data))
def on_value(value):
object_data[key] = value
def _comma(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _comma # Ignore whitespace
if byte_data == 0x2c: # ,
return _key
if byte_data == 0x7d: #
return emit(object_data)
raise Exception("Unexpected byte: 0x" + str(byte_data))
return _object
Phiên bản tương thích Python 2
# A streaming byte oriented JSON parser. Feed it a single byte at a time and
# it will emit complete objects as it comes across them. Whitespace within and
# between objects is ignored. This means it can parse newline delimited JSON.
import math
def json_machine(emit, next_func=None):
def _value(byte_data):
if not byte_data:
return
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _value # Ignore whitespace
if byte_data == 0x22: # "
return string_machine(on_value)
if byte_data == 0x2d or (0x30 <= byte_data < 0x40): # - or 0-9
return number_machine(byte_data, on_number)
if byte_data == 0x7b: #:
return object_machine(on_value)
if byte_data == 0x5b: # [
return array_machine(on_value)
if byte_data == 0x74: # t
return constant_machine(TRUE, True, on_value)
if byte_data == 0x66: # f
return constant_machine(FALSE, False, on_value)
if byte_data == 0x6e: # n
return constant_machine(NULL, None, on_value)
if next_func == _value:
raise Exception("Unexpected 0x" + str(byte_data))
return next_func(byte_data)
def on_value(value):
emit(value)
return next_func
def on_number(number, byte):
emit(number)
return _value(byte)
next_func = next_func or _value
return _value
TRUE = [0x72, 0x75, 0x65]
FALSE = [0x61, 0x6c, 0x73, 0x65]
NULL = [0x75, 0x6c, 0x6c]
def constant_machine(bytes_data, value, emit):
local_data = {"i": 0, "length": len(bytes_data)}
def _constant(byte_data):
# nonlocal i, length
if byte_data != bytes_data[local_data["i"]]:
local_data["i"] += 1
raise Exception("Unexpected 0x" + byte_data.toString(16))
local_data["i"] += 1
if local_data["i"] < local_data["length"]:
return _constant
return emit(value)
return _constant
def string_machine(emit):
local_data = {"string": ""}
def _string(byte_data):
# nonlocal string
if byte_data == 0x22: # "
return emit(local_data["string"])
if byte_data == 0x5c: # \
return _escaped_string
if byte_data & 0x80: # UTF-8 handling
return utf8_machine(byte_data, on_char_code)
if byte_data < 0x20: # ASCII control character
raise Exception("Unexpected control character: 0x" + byte_data.toString(16))
local_data["string"] += chr(byte_data)
return _string
def _escaped_string(byte_data):
# nonlocal string
if byte_data == 0x22 or byte_data == 0x5c or byte_data == 0x2f: # " \ /
local_data["string"] += chr(byte_data)
return _string
if byte_data == 0x62: # b
local_data["string"] += "\b"
return _string
if byte_data == 0x66: # f
local_data["string"] += "\f"
return _string
if byte_data == 0x6e: # n
local_data["string"] += "\n"
return _string
if byte_data == 0x72: # r
local_data["string"] += "\r"
return _string
if byte_data == 0x74: # t
local_data["string"] += "\t"
return _string
if byte_data == 0x75: # u
return hex_machine(on_char_code)
def on_char_code(char_code):
# nonlocal string
local_data["string"] += chr(char_code)
return _string
return _string
# Nestable state machine for UTF-8 Decoding.
def utf8_machine(byte_data, emit):
local_data = {"left": 0, "num": 0}
def _utf8(byte_data):
# nonlocal num, left
if (byte_data & 0xc0) != 0x80:
raise Exception("Invalid byte in UTF-8 character: 0x" + byte_data.toString(16))
local_data["left"] -= 1
local_data["num"] |= (byte_data & 0x3f) << (local_data["left"] * 6)
if local_data["left"]:
return _utf8
return emit(local_data["num"])
if 0xc0 <= byte_data < 0xe0: # 2-byte UTF-8 Character
local_data["left"] = 1
local_data["num"] = (byte_data & 0x1f) << 6
return _utf8
if 0xe0 <= byte_data < 0xf0: # 3-byte UTF-8 Character
local_data["left"] = 2
local_data["num"] = (byte_data & 0xf) << 12
return _utf8
if 0xf0 <= byte_data < 0xf8: # 4-byte UTF-8 Character
local_data["left"] = 3
local_data["num"] = (byte_data & 0x07) << 18
return _utf8
raise Exception("Invalid byte in UTF-8 string: 0x" + str(byte_data))
# Nestable state machine for hex escaped characters
def hex_machine(emit):
local_data = {"left": 4, "num": 0}
def _hex(byte_data):
# nonlocal num, left
i = 0 # Parse the hex byte
if 0x30 <= byte_data < 0x40:
i = byte_data - 0x30
elif 0x61 <= byte_data <= 0x66:
i = byte_data - 0x57
elif 0x41 <= byte_data <= 0x46:
i = byte_data - 0x37
else:
raise Exception("Expected hex char in string hex escape")
local_data["left"] -= 1
local_data["num"] |= i << (local_data["left"] * 4)
if local_data["left"]:
return _hex
return emit(local_data["num"])
return _hex
def number_machine(byte_data, emit):
local_data = {"sign": 1, "number": 0, "decimal": 0, "esign": 1, "exponent": 0}
def _mid(byte_data):
if byte_data == 0x2e: # .
return _decimal
return _later(byte_data)
def _number(byte_data):
# nonlocal number
if 0x30 <= byte_data < 0x40:
local_data["number"] = local_data["number"] * 10 + (byte_data - 0x30)
return _number
return _mid(byte_data)
def _start(byte_data):
if byte_data == 0x30:
return _mid
if 0x30 < byte_data < 0x40:
return _number(byte_data)
raise Exception("Invalid number: 0x" + byte_data.toString(16))
if byte_data == 0x2d: # -
local_data["sign"] = -1
return _start
def _decimal(byte_data):
# nonlocal decimal
if 0x30 <= byte_data < 0x40:
local_data["decimal"] = (local_data["decimal"] + byte_data - 0x30) / 10
return _decimal
return _later(byte_data)
def _later(byte_data):
if byte_data == 0x45 or byte_data == 0x65: # E e
return _esign
return _done(byte_data)
def _esign(byte_data):
# nonlocal esign
if byte_data == 0x2b: # +
return _exponent
if byte_data == 0x2d: # -
local_data["esign"] = -1
return _exponent
return _exponent(byte_data)
def _exponent(byte_data):
# nonlocal exponent
if 0x30 <= byte_data < 0x40:
local_data["exponent"] = local_data["exponent"] * 10 + (byte_data - 0x30)
return _exponent
return _done(byte_data)
def _done(byte_data):
value = local_data["sign"] * (local_data["number"] + local_data["decimal"])
if local_data["exponent"]:
value *= math.pow(10, local_data["esign"] * local_data["exponent"])
return emit(value, byte_data)
return _start(byte_data)
def array_machine(emit):
local_data = {"array_data": []}
def _array(byte_data):
if byte_data == 0x5d: # ]
return emit(local_data["array_data"])
return json_machine(on_value, _comma)(byte_data)
def on_value(value):
# nonlocal array_data
local_data["array_data"].append(value)
def _comma(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _comma # Ignore whitespace
if byte_data == 0x2c: # ,
return json_machine(on_value, _comma)
if byte_data == 0x5d: # ]
return emit(local_data["array_data"])
raise Exception("Unexpected byte: 0x" + str(byte_data) + " in array body")
return _array
def object_machine(emit):
local_data = {"object_data": {}, "key": ""}
def _object(byte_data):
# nonlocal object_data, key
if byte_data == 0x7d: #
return emit(local_data["object_data"])
return _key(byte_data)
def _key(byte_data):
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _object # Ignore whitespace
if byte_data == 0x22:
return string_machine(on_key)
raise Exception("Unexpected byte: 0x" + byte_data.toString(16))
def on_key(result):
# nonlocal object_data, key
local_data["key"] = result
return _colon
def _colon(byte_data):
# nonlocal object_data, key
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _colon # Ignore whitespace
if byte_data == 0x3a: # :
return json_machine(on_value, _comma)
raise Exception("Unexpected byte: 0x" + str(byte_data))
def on_value(value):
# nonlocal object_data, key
local_data["object_data"][local_data["key"]] = value
def _comma(byte_data):
# nonlocal object_data
if byte_data == 0x09 or byte_data == 0x0a or byte_data == 0x0d or byte_data == 0x20:
return _comma # Ignore whitespace
if byte_data == 0x2c: # ,
return _key
if byte_data == 0x7d: #
return emit(local_data["object_data"])
raise Exception("Unexpected byte: 0x" + str(byte_data))
return _object
Kiểm tra nó
if __name__ == "__main__":
test_json = """[1,2,"3"] {"name":
"tarun"} 1 2
3 [{"name":"a",
"data": [1,
null,2]}]
"""
def found_json(data):
print(data)
state = json_machine(found_json)
for char in test_json:
state = state(ord(char))
Đầu ra của cùng là
[1, 2, '3']
{'name': 'tarun'}
1
2
3
[{'name': 'a', 'data': [1, None, 2]}]