1
0
Fork 0
mirror of https://gitlab.com/bramw/baserow.git synced 2025-04-15 09:34:13 +00:00

Resolve "Disable number type guessing on import"

This commit is contained in:
Nigel Gott 2022-07-06 11:26:02 +00:00
parent b3c8c6fda1
commit 8ca6829346
6 changed files with 54 additions and 129 deletions
backend
src/baserow/contrib/database
management/commands
table
tests/baserow/contrib/database
changelog.md
web-frontend/modules/database/components/table

View file

@ -1,5 +1,5 @@
import sys
import random
import sys
from django.core.management.base import BaseCommand
@ -64,7 +64,14 @@ def fill_table_fields(limit, table):
# This is a helper cli command, randomness is not being used for any security
# or crypto related reasons.
field_type_name, all_kwargs = random.choice(allowed_field_list) # nosec
kwargs = random.choice(all_kwargs) # nosec
# These two kwarg types depend on another field existing, which it might
# not as we are picking randomly.
allowed_kwargs_list = [
kwargs
for kwargs in all_kwargs
if kwargs["name"] not in ["formula_singleselect", "formula_email"]
]
kwargs = random.choice(allowed_kwargs_list) # nosec
kwargs.pop("primary", None)
kwargs["name"] = field_handler.find_next_unused_field_name(
table, [kwargs["name"]]

View file

@ -1,6 +1,4 @@
from typing import Any, cast, NewType, List, Tuple, Optional, Dict
from collections import defaultdict
from decimal import Decimal
from django.conf import settings
from django.contrib.auth.models import AbstractUser
@ -9,8 +7,7 @@ from django.utils import timezone
from django.utils import translation
from django.utils.translation import gettext as _
from baserow.core.jobs.handler import JobHandler
from baserow.core.jobs.models import Job
from baserow.contrib.database.db.schema import safe_django_schema_editor
from baserow.contrib.database.fields.constants import RESERVED_BASEROW_FIELD_NAMES
from baserow.contrib.database.fields.exceptions import (
MaxFieldLimitExceeded,
@ -18,13 +15,14 @@ from baserow.contrib.database.fields.exceptions import (
ReservedBaserowFieldNameException,
InvalidBaserowFieldName,
)
from baserow.contrib.database.fields.registries import field_type_registry
from baserow.contrib.database.fields.models import Field
from baserow.contrib.database.fields.registries import field_type_registry
from baserow.contrib.database.models import Database
from baserow.contrib.database.views.handler import ViewHandler
from baserow.contrib.database.views.view_types import GridViewType
from baserow.core.jobs.handler import JobHandler
from baserow.core.jobs.models import Job
from baserow.core.trash.handler import TrashHandler
from baserow.contrib.database.db.schema import safe_django_schema_editor
from .exceptions import (
TableDoesNotExist,
TableNotInDatabase,
@ -36,7 +34,6 @@ from .exceptions import (
from .models import Table
from .signals import table_updated, table_deleted, tables_reordered
TableForUpdate = NewType("TableForUpdate", Table)
@ -353,31 +350,8 @@ class TableHandler:
normalized_data = []
# Fill missing rows and computes value type frequencies
value_type_frequencies = defaultdict(lambda: defaultdict(lambda: 0))
for row in data:
new_row = []
for index, value in enumerate(row):
if value is not None:
if isinstance(value, int):
if len(str(value)) <= 50:
value_type_frequencies[index]["number_int"] += 1
else:
value_type_frequencies[index]["text"] += 1
elif isinstance(value, float):
sign, digits, exponent = Decimal(str(value)).as_tuple()
if abs(exponent) <= 10 and len(digits) <= 50 + 10:
value_type_frequencies[index]["number_float"] += 1
else:
value_type_frequencies[index]["text"] += 1
else:
value = "" if value is None else str(value)
if len(value) > 255:
value_type_frequencies[index]["long_text"] += 1
else:
value_type_frequencies[index]["text"] += 1
new_row.append(value)
new_row = list(row)
# Fill incomplete rows with empty values
for i in range(len(new_row), largest_column_count):
@ -386,38 +360,8 @@ class TableHandler:
normalized_data.append(new_row)
field_with_type = []
tolerated_error_threshold = (
len(normalized_data)
/ 100
* settings.BASEROW_IMPORT_TOLERATED_TYPE_ERROR_THRESHOLD
)
# Try to guess field type from type frequency
for index, field_name in enumerate(fields):
frequencies = value_type_frequencies[index]
if frequencies["long_text"] > tolerated_error_threshold:
field_with_type.append(
(field_name, "long_text", {"field_options": {"width": 400}})
)
elif frequencies["text"] > tolerated_error_threshold:
field_with_type.append((field_name, "text", {}))
elif frequencies["number_float"] > tolerated_error_threshold:
field_with_type.append(
(
field_name,
"number",
{"number_negative": True, "number_decimal_places": 10},
)
)
elif frequencies["number_int"] > tolerated_error_threshold:
field_with_type.append(
(
field_name,
"number",
{"number_negative": True, "field_options": {"width": 150}},
)
)
else:
field_with_type.append((field_name, "text", {}))
field_with_type.append((field_name, "text", {}))
return field_with_type, normalized_data

View file

@ -1,5 +1,6 @@
import pytest
from django.conf import settings
from pytest_unordered import unordered
from baserow.contrib.database.fields.dependencies.circular_reference_checker import (
will_cause_circular_dep,
@ -294,7 +295,7 @@ def test_get_dependant_fields_with_type(data_fixture):
(text_field_1_dependency_2, text_field_type, None),
(text_field_1_and_2_dependency_1, text_field_type, None),
]
assert results == expected_text_field_1_dependants
assert results == unordered(expected_text_field_1_dependants)
results = FieldDependencyHandler.get_dependant_fields_with_type(
table.id,
@ -312,7 +313,7 @@ def test_get_dependant_fields_with_type(data_fixture):
),
(text_field_1_and_2_dependency_1, text_field_type, None),
]
assert results == expected_text_field_2_dependants
assert results == unordered(expected_text_field_2_dependants)
results = FieldDependencyHandler.get_dependant_fields_with_type(
table.id,
@ -328,6 +329,6 @@ def test_get_dependant_fields_with_type(data_fixture):
associated_relations_changed=True,
field_cache=field_cache,
)
assert (
results == expected_text_field_1_dependants + expected_text_field_2_dependants
assert results == unordered(
expected_text_field_1_dependants + expected_text_field_2_dependants
)

View file

@ -1,39 +1,37 @@
import pytest
from pyinstrument import Profiler
from baserow.core.exceptions import UserNotInGroup
from freezegun import freeze_time
from decimal import Decimal
from django.utils import timezone
from django.conf import settings
from django.test.utils import override_settings
from django.utils import timezone
from freezegun import freeze_time
from pyinstrument import Profiler
from baserow.contrib.database.fields.models import SelectOption
from baserow.contrib.database.fields.field_cache import FieldCache
from baserow.contrib.database.fields.dependencies.handler import FieldDependencyHandler
from baserow.contrib.database.fields.models import TextField, LongTextField, NumberField
from baserow.core.jobs.tasks import run_async_job, clean_up_jobs
from baserow.core.jobs.models import Job
from baserow.core.jobs.constants import (
JOB_FAILED,
JOB_FINISHED,
JOB_STARTED,
JOB_PENDING,
)
from baserow.contrib.database.file_import.exceptions import (
FileImportMaxErrorCountExceeded,
)
from baserow.contrib.database.fields.exceptions import (
MaxFieldLimitExceeded,
MaxFieldNameLengthExceeded,
ReservedBaserowFieldNameException,
InvalidBaserowFieldName,
)
from baserow.contrib.database.fields.field_cache import FieldCache
from baserow.contrib.database.fields.models import SelectOption
from baserow.contrib.database.fields.models import TextField
from baserow.contrib.database.file_import.exceptions import (
FileImportMaxErrorCountExceeded,
)
from baserow.contrib.database.table.exceptions import (
InvalidInitialTableData,
InitialTableDataLimitExceeded,
InitialTableDataDuplicateName,
)
from baserow.core.exceptions import UserNotInGroup
from baserow.core.jobs.constants import (
JOB_FAILED,
JOB_FINISHED,
JOB_STARTED,
JOB_PENDING,
)
from baserow.core.jobs.models import Job
from baserow.core.jobs.tasks import run_async_job, clean_up_jobs
@pytest.mark.django_db(transaction=True)
@ -225,14 +223,12 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
field1, field2, field3, field4, field5 = fields
assert isinstance(field1.specific, TextField)
assert isinstance(field2.specific, NumberField)
assert field2.specific.number_decimal_places == 10
assert isinstance(field3.specific, NumberField)
assert field3.specific.number_decimal_places == 10
assert isinstance(field4.specific, LongTextField)
assert isinstance(field5.specific, LongTextField)
assert isinstance(field2.specific, TextField)
assert isinstance(field3.specific, TextField)
assert isinstance(field4.specific, TextField)
assert isinstance(field5.specific, TextField)
assert getattr(rows[0], field3.db_column) == Decimal("1.55")
assert getattr(rows[0], field3.db_column) == "1.55"
# Test type guessing with error threshold
with override_settings(
@ -246,7 +242,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
table = job.table
model = table.get_model()
rows = model.objects.all()
assert len(rows) == 19
assert len(rows) == 20
fields = table.field_set.all()
@ -255,11 +251,9 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
field1, field2, field3, field4, field5 = fields
assert isinstance(field1.specific, TextField)
assert isinstance(field2.specific, NumberField)
assert field2.specific.number_decimal_places == 0
assert isinstance(field3.specific, NumberField)
assert field3.specific.number_decimal_places == 10
assert isinstance(field4.specific, LongTextField)
assert isinstance(field2.specific, TextField)
assert isinstance(field3.specific, TextField)
assert isinstance(field4.specific, TextField)
assert isinstance(field5.specific, TextField)
# Import data to an existing table
@ -272,7 +266,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
run_async_job(job.id)
rows = model.objects.all()
assert len(rows) == 21
assert len(rows) == 22
# Import data with error
data = [
@ -291,27 +285,9 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
job.refresh_from_db()
rows = model.objects.all()
assert len(rows) == 23
assert len(rows) == 26
assert sorted(job.report["failing_rows"].keys()) == ["0", "3"]
assert job.report["failing_rows"]["0"] == {
f"field_{field2.id}": [
{"code": "invalid", "error": "A valid number is required."}
],
f"field_{field3.id}": [
{"code": "invalid", "error": "A valid number is required."}
],
}
assert job.report["failing_rows"]["3"] == {
f"field_{field2.id}": [
{
"code": "max_decimal_places",
"error": "Ensure that there are no more than 0 decimal places.",
}
]
}
assert sorted(job.report["failing_rows"].keys()) == []
# Change user language to test message i18n
job.user.profile.language = "fr"
@ -327,12 +303,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
job.refresh_from_db()
assert job.report["failing_rows"]["0"][f"field_{field2.id}"] == [
{
"code": "invalid",
"error": "Un nombre valide est requis.",
}
]
assert job.report["failing_rows"] == {}
data = [
[1.3],

View file

@ -13,6 +13,8 @@ For example:
### Bug Fixes
* Disable table import field type guessing and instead always import as text fields. [#1050](https://gitlab.com/bramw/baserow/-/issues/1050)
### Breaking Changes

View file

@ -224,7 +224,7 @@ export default {
header.forEach((key) => {
const exists = Object.prototype.hasOwnProperty.call(entry, key)
const value = exists ? entry[key].toString() : ''
const value = exists ? JSON.stringify(entry[key]) : ''
row.push(value)
})