Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 40 additions & 7 deletions csv_diff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,48 @@ def load_csv(fp, key=None, dialect=None):
# Oh well, we tried. Fallback to the default.
pass
fp = csv.reader(fp, dialect=(dialect or "excel"))
headings = next(fp)
rows = [dict(zip(headings, line)) for line in fp]
try:
headings = next(fp)
except StopIteration:
raise ValueError("CSV input is empty (no header row found)")
if not headings:
raise ValueError("CSV input has an empty header row")
rows = {}
# Track the 1-based source line number alongside each row so that any
# downstream KeyError or value-shape error can point back to the line
# in the input file the user just gave us. The header is on line 1.
for line_number, line in enumerate(fp, start=2):
# csv.reader yields an empty list for a fully-blank line (a stray
# trailing newline, the kind GitHub and most editors insert by
# default). Silently skipping those matches the "POSIX text file"
# convention and the behaviour of most other CSV tools; raising
# KeyError('a') at the very end of a diff made the tool look
# broken on perfectly normal input. See issue #29.
if not line:
continue
if len(line) < len(headings):
raise ValueError(
f"CSV row on line {line_number} has {len(line)} field(s) "
f"but the header on line 1 has {len(headings)}; "
f"got {line!r}"
)
rows[line_number] = dict(zip(headings, line))
if key:
keyfn = lambda r: r[key]
try:
return {rows[ln][key]: rows[ln] for ln in rows}
except KeyError as exc:
missing = exc.args[0]
raise ValueError(
f"Key column {missing!r} not present in CSV header "
f"{headings!r}"
) from None
else:
keyfn = lambda r: hashlib.sha1(
json.dumps(r, sort_keys=True).encode("utf8")
).hexdigest()
return {keyfn(r): r for r in rows}
return {
hashlib.sha1(
json.dumps(rows[ln], sort_keys=True).encode("utf8")
).hexdigest(): rows[ln]
for ln in rows
}


def load_json(fp, key=None):
Expand Down
36 changes: 36 additions & 0 deletions tests/test_csv_diff.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from csv_diff import load_csv, compare
import io
import pytest

ONE = """id,name,age
1,Cleo,4
Expand Down Expand Up @@ -115,3 +116,38 @@ def test_tsv():
"columns_added": [],
"columns_removed": [],
} == diff


def test_trailing_blank_line_ignored():
    """A trailing blank line must not break keyed loading (issue #29).

    GitHub and most editors end files with a newline; the resulting empty
    line should be ignored instead of crashing with a KeyError on the key
    column.
    """
    source = io.StringIO("a,b,c\n1,2,3\n\n")
    expected = {"1": {"a": "1", "b": "2", "c": "3"}}
    assert load_csv(source, key="a") == expected


def test_interior_blank_line_ignored():
    """A blank line between two data rows is skipped; both rows are kept."""
    source = io.StringIO("a,b,c\n1,2,3\n\n4,5,6\n")
    expected = {
        "1": {"a": "1", "b": "2", "c": "3"},
        "4": {"a": "4", "b": "5", "c": "6"},
    }
    assert load_csv(source, key="a") == expected


def test_trailing_blank_line_with_no_key():
    """Without a key column, a trailing blank line still yields one row."""
    result = load_csv(io.StringIO("a,b,c\n1,2,3\n\n"))
    # Only the row values are compared; the keys the loader assigns are
    # deliberately not part of this assertion.
    assert list(result.values()) == [{"a": "1", "b": "2", "c": "3"}]


def test_mismatched_row_length_raises_clear_error():
    """A row with fewer fields than the header raises a descriptive error."""
    short_row_csv = io.StringIO("a,b,c\n1,2,3\n4,5\n")
    # The message has to name the offending line (3), the number of fields
    # found (2) and the number of header columns (3), in that order.
    with pytest.raises(ValueError, match=r"line 3.*2 field.*3"):
        load_csv(short_row_csv, key="a")


def test_missing_key_column_raises_clear_error():
    """Asking for a key column absent from the header raises ValueError."""
    source = io.StringIO("a,b,c\n1,2,3\n")
    # "z" is not one of the header names a, b, c.
    with pytest.raises(ValueError, match=r"Key column 'z' not present"):
        load_csv(source, key="z")