前言
前段时间同事说他有个需求是比较两个JSON文件之间的差异点,身为DB大神的同事用SQL实现了这个需求,让只会CRUD的我直呼神乎其技。当时用一个一千多万字符、四十多万行的JSON文件来测试,SQL查出来要9秒。周六有时间,拜读了下同事的SQL,打算用Python和Go各实现一遍试试。
测试的json文件如下,其中dst.json从src.json文件复制而来,随便找了个地方改了下。单文件为409510行,字符数为11473154左右。
$ wc -ml ./src.json dst.json
  409510 11473154 ./src.json
  409510 11473155 dst.json
  819020 22946309 总计
第三方库jsondiff
先在网上搜了下有没有现成的第三方库,找到一个叫jsondiff的第三方python库。使用pip安装后,用法如下
"""Diff two JSON files with the third-party ``jsondiff`` package."""
import json
import os
from typing import Any


def read_json(filepath: str) -> Any:
    """Load and return the JSON document stored at *filepath*.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        ValueError: If the file content is not valid JSON.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"{filepath} is not a valid json file") from e


def main() -> None:
    """Print the jsondiff result for ``src.json`` versus ``dst.json``."""
    # Imported here so that read_json stays importable when the third-party
    # package is not installed.
    import jsondiff

    src_data = read_json("src.json")
    dst_data = read_json("dst.json")
    diffs = jsondiff.diff(src_data, dst_data)
    print(diffs)


if __name__ == "__main__":
    main()
运行测试
$ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %M kbytes' python third.py
{'timepicker': {'time_options': {insert: [(7, '7dd')], delete: [7]}}}
Elapsed Time: 1576.30 s Max RSS: 87732 kbytes
运行时间太长了,接近半小时,肯定不能拿给别人用。
Python-仅用标准库
只试了jsondiff这一个第三方库,接下来打算直接参考同事那个SQL的思路,自己只用标准库实现一个。
"""Report structural differences between two JSON files (standard library only)."""
from typing import Any, List
import json
import os
from dataclasses import dataclass
from collections.abc import MutableSequence, MutableMapping


@dataclass
class DiffResult:
    """One difference found between the left and right documents."""

    path: str  # dotted / indexed location, e.g. "a.b[3]"
    kind: str  # "added", "removed" or "modified"
    # Both sides default to None so "added" / "removed" results can be built
    # with only the side that actually exists.
    left: Any = None
    right: Any = None


def add_path(parent: str, key: str) -> str:
    """Join a parent path and a key name into a dotted path string."""
    if parent == "":
        return key
    return parent + "." + key


def read_json(filepath: str) -> Any:
    """Load and return the JSON document stored at *filepath*.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        ValueError: If the file content is not valid JSON.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(filepath)
    try:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except json.JSONDecodeError as e:
        raise ValueError(f"{filepath} is not a valid json file") from e


def _values_differ(left: Any, right: Any) -> bool:
    """Return True when two non-container values should be reported as modified."""
    # In Python True == 1 and False == 0, but JSON true/false are not numbers.
    if isinstance(left, bool) != isinstance(right, bool):
        return True
    return left != right


def collect_diff(path: str, left: Any, right: Any) -> List[DiffResult]:
    """Compare two decoded JSON values and list their differences.

    Mappings are compared key by key and sequences index by index; any other
    pair of values (including a type mismatch) is compared directly.

    Args:
        path: Path of the values being compared ("" for the document root).
        left: Value from the left-hand document.
        right: Value from the right-hand document.

    Returns:
        The differences in a deterministic order: for mappings, keys of
        *left* in their own order followed by keys that exist only in
        *right*; for sequences, ascending index.
    """
    diffs: List[DiffResult] = []

    if isinstance(left, MutableMapping) and isinstance(right, MutableMapping):
        for k in left:
            key_path = add_path(path, k)
            if k in right:
                # Present on both sides: recurse into the values.
                diffs.extend(collect_diff(key_path, left[k], right[k]))
            else:
                diffs.append(DiffResult(key_path, "removed", left=left[k]))
        for k in right:
            if k not in left:
                diffs.append(DiffResult(add_path(path, k), "added", right=right[k]))
    elif isinstance(left, MutableSequence) and isinstance(right, MutableSequence):
        # Indexes present on both sides are compared recursively.
        for i, (lv, rv) in enumerate(zip(left, right)):
            diffs.extend(collect_diff(f"{path}[{i}]", lv, rv))
        shared = min(len(left), len(right))
        # At most one of the two loops below runs: the longer side's tail.
        for i in range(shared, len(left)):
            diffs.append(DiffResult(f"{path}[{i}]", "removed", left=left[i]))
        for i in range(shared, len(right)):
            diffs.append(DiffResult(f"{path}[{i}]", "added", right=right[i]))
    elif _values_differ(left, right):
        # Scalars, or containers of different kinds.
        diffs.append(DiffResult(path, "modified", left=left, right=right))

    return diffs


def main() -> None:
    """Diff ``src.json`` against ``dst.json`` and print a readable report."""
    src_dict = read_json("src.json")
    dst_dict = read_json("dst.json")
    diffs = collect_diff("", src_dict, dst_dict)
    if not diffs:
        print("No differences found.")
        return
    print(f"Found {len(diffs)} differences:")
    for diff in diffs:
        if diff.kind == "added":
            print(f"Added: {diff.path}, {diff.right}")
        elif diff.kind == "removed":
            print(f"Removed: {diff.path}, {diff.left}")
        elif diff.kind == "modified":
            print(f"Modified: {diff.path}, {diff.left} -> {diff.right}")


if __name__ == "__main__":
    main()
运行测试
$ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %M kbytes' python main.py
Found 1 differences:
Modified: timepicker.time_options[7], 7d -> 7dd
Elapsed Time: 0.46 s Max RSS: 87976 kbytes
只要 0.46 秒就能比较出来差异点,单论比较性能来说,比jsondiff要好很多。
Go实现
再换go来实现个命令行工具,同样只需要用标准库即可。
// Command diffjson reports the structural differences between two JSON files.
package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"os"
	"reflect"
	"sort"
)

var (
	src_file string
	dst_file string
)

// DiffResult describes one difference between the left and right documents.
type DiffResult struct {
	Path  string // dotted / indexed location, e.g. "a.b[3]"
	Kind  string // "added", "removed" or "modified"
	Left  any    // value on the left side (nil for "added")
	Right any    // value on the right side (nil for "removed")
}

// addPath joins a parent path and a key name into a dotted path.
func addPath(parent, key string) string {
	if parent == "" {
		return key
	}
	return parent + "." + key
}

// sortedKeys returns the keys of m in ascending order so that the report
// order does not depend on Go's randomized map iteration.
func sortedKeys(m map[string]any) []string {
	keys := make([]string, 0, len(m))
	for k := range m {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	return keys
}

// collectDiff compares two decoded JSON values and returns their differences.
//
// Objects are compared key by key and arrays index by index. Any other pair
// of values, including a type mismatch, is reported as "modified" when the
// two values are not deeply equal.
func collectDiff(path string, left, right any) []DiffResult {
	var diffs []DiffResult

	switch l := left.(type) {
	case map[string]any:
		r, ok := right.(map[string]any)
		if !ok {
			return append(diffs, DiffResult{Path: path, Kind: "modified", Left: left, Right: right})
		}
		for _, k := range sortedKeys(l) {
			rv, exists := r[k]
			if !exists {
				diffs = append(diffs, DiffResult{Path: addPath(path, k), Kind: "removed", Left: l[k]})
				continue
			}
			diffs = append(diffs, collectDiff(addPath(path, k), l[k], rv)...)
		}
		for _, k := range sortedKeys(r) {
			if _, exists := l[k]; !exists {
				diffs = append(diffs, DiffResult{Path: addPath(path, k), Kind: "added", Right: r[k]})
			}
		}

	case []any:
		r, ok := right.([]any)
		if !ok {
			return append(diffs, DiffResult{Path: path, Kind: "modified", Left: left, Right: right})
		}
		// Simplification: arrays are compared by index, not by content alignment.
		shared := len(l)
		if len(r) < shared {
			shared = len(r)
		}
		for i := 0; i < shared; i++ {
			diffs = append(diffs, collectDiff(fmt.Sprintf("%s[%d]", path, i), l[i], r[i])...)
		}
		// At most one of the two loops below runs: the longer side's tail.
		for i := shared; i < len(l); i++ {
			diffs = append(diffs, DiffResult{Path: fmt.Sprintf("%s[%d]", path, i), Kind: "removed", Left: l[i]})
		}
		for i := shared; i < len(r); i++ {
			diffs = append(diffs, DiffResult{Path: fmt.Sprintf("%s[%d]", path, i), Kind: "added", Right: r[i]})
		}

	default:
		// Compare by type and value. Comparing the "%v" renderings would
		// treat the string "1" and the number 1 as equal.
		if !reflect.DeepEqual(left, right) {
			diffs = append(diffs, DiffResult{Path: path, Kind: "modified", Left: left, Right: right})
		}
	}

	return diffs
}

// readJSON decodes one JSON document from r. The top-level value must be a
// JSON object (or null); any other top-level value is a decode error.
func readJSON(r io.Reader) (map[string]any, error) {
	var data map[string]any
	decoder := json.NewDecoder(r)
	if err := decoder.Decode(&data); err != nil {
		return nil, err
	}
	return data, nil
}

// loadJSONFile opens the named file and decodes it with readJSON.
func loadJSONFile(name string) (map[string]any, error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, fmt.Errorf("opening %s: %w", name, err)
	}
	defer f.Close()

	data, err := readJSON(f)
	if err != nil {
		return nil, fmt.Errorf("reading %s: %w", name, err)
	}
	return data, nil
}

// run loads both files, diffs them and prints the report to stdout.
func run() error {
	srcJson, err := loadJSONFile(src_file)
	if err != nil {
		return err
	}
	dstJson, err := loadJSONFile(dst_file)
	if err != nil {
		return err
	}

	diffs := collectDiff("", srcJson, dstJson)
	if len(diffs) == 0 {
		fmt.Println("No differences found.")
		return nil
	}

	fmt.Printf("%d differences found:\n", len(diffs))
	for _, diff := range diffs {
		switch diff.Kind {
		case "added":
			fmt.Printf("Added: %s: %v\n", diff.Path, diff.Right)
		case "removed":
			fmt.Printf("Removed: %s: %v\n", diff.Path, diff.Left)
		case "modified":
			fmt.Printf("Modified: %s: %v -> %v\n", diff.Path, diff.Left, diff.Right)
		}
	}
	return nil
}

func main() {
	flag.StringVar(&src_file, "src", "src.json", "source file")
	flag.StringVar(&dst_file, "dst", "dst.json", "destination file")
	flag.Parse()

	// Errors are reported with the real file name and a non-zero exit status
	// so that scripts calling this tool can detect failure.
	if err := run(); err != nil {
		fmt.Fprintf(os.Stderr, "Error %v\n", err)
		os.Exit(1)
	}
}
运行测试,速度同样很快。
$ /usr/bin/time -f 'Elapsed Time: %e s Max RSS: %M kbytes' ./diffjson -src ./src.json -dst ./dst.json
1 differences found:
Modified: timepicker.time_options[7]: 7d -> 7dd
Elapsed Time: 0.29 s Max RSS: 117468 kbytes