Resolve "Disable number type guessing on import"

2025-04-15 09:34:13 +00:00 · 2022-07-06 11:26:02 +00:00 · 2022-07-06 11:26:02 +00:00 · 8ca6829346
commit 8ca6829346
parent b3c8c6fda1
6 changed files with 54 additions and 129 deletions
--- a/backend/src/baserow/contrib/database/management/commands/fill_table_fields.py
+++ b/backend/src/baserow/contrib/database/management/commands/fill_table_fields.py
@ -1,5 +1,5 @@
-import sys
 import random
+import sys

 from django.core.management.base import BaseCommand

@ -64,7 +64,14 @@ def fill_table_fields(limit, table):
        # This is a helper cli command, randomness is not being used for any security
        # or crypto related reasons.
        field_type_name, all_kwargs = random.choice(allowed_field_list)  # nosec
-        kwargs = random.choice(all_kwargs)  # nosec
+        # These two kwarg types depend on another field existing, which it might
+        # not as we are picking randomly.
+        allowed_kwargs_list = [
+            kwargs
+            for kwargs in all_kwargs
+            if kwargs["name"] not in ["formula_singleselect", "formula_email"]
+        ]
+        kwargs = random.choice(allowed_kwargs_list)  # nosec
        kwargs.pop("primary", None)
        kwargs["name"] = field_handler.find_next_unused_field_name(
            table, [kwargs["name"]]
--- a/backend/src/baserow/contrib/database/table/handler.py
+++ b/backend/src/baserow/contrib/database/table/handler.py
@ -1,6 +1,4 @@
 from typing import Any, cast, NewType, List, Tuple, Optional, Dict
-from collections import defaultdict
-from decimal import Decimal

 from django.conf import settings
 from django.contrib.auth.models import AbstractUser
@ -9,8 +7,7 @@ from django.utils import timezone
 from django.utils import translation
 from django.utils.translation import gettext as _

-from baserow.core.jobs.handler import JobHandler
-from baserow.core.jobs.models import Job
+from baserow.contrib.database.db.schema import safe_django_schema_editor
 from baserow.contrib.database.fields.constants import RESERVED_BASEROW_FIELD_NAMES
 from baserow.contrib.database.fields.exceptions import (
    MaxFieldLimitExceeded,
@ -18,13 +15,14 @@ from baserow.contrib.database.fields.exceptions import (
    ReservedBaserowFieldNameException,
    InvalidBaserowFieldName,
 )
-from baserow.contrib.database.fields.registries import field_type_registry
 from baserow.contrib.database.fields.models import Field
+from baserow.contrib.database.fields.registries import field_type_registry
 from baserow.contrib.database.models import Database
 from baserow.contrib.database.views.handler import ViewHandler
 from baserow.contrib.database.views.view_types import GridViewType
+from baserow.core.jobs.handler import JobHandler
+from baserow.core.jobs.models import Job
 from baserow.core.trash.handler import TrashHandler
-from baserow.contrib.database.db.schema import safe_django_schema_editor
 from .exceptions import (
    TableDoesNotExist,
    TableNotInDatabase,
@ -36,7 +34,6 @@ from .exceptions import (
 from .models import Table
 from .signals import table_updated, table_deleted, tables_reordered

-
 TableForUpdate = NewType("TableForUpdate", Table)


@ -353,31 +350,8 @@ class TableHandler:

        normalized_data = []

-        # Fill missing rows and computes value type frequencies
-        value_type_frequencies = defaultdict(lambda: defaultdict(lambda: 0))
        for row in data:
-            new_row = []
-            for index, value in enumerate(row):
-                if value is not None:
-                    if isinstance(value, int):
-                        if len(str(value)) <= 50:
-                            value_type_frequencies[index]["number_int"] += 1
-                        else:
-                            value_type_frequencies[index]["text"] += 1
-                    elif isinstance(value, float):
-                        sign, digits, exponent = Decimal(str(value)).as_tuple()
-                        if abs(exponent) <= 10 and len(digits) <= 50 + 10:
-                            value_type_frequencies[index]["number_float"] += 1
-                        else:
-                            value_type_frequencies[index]["text"] += 1
-                    else:
-                        value = "" if value is None else str(value)
-                        if len(value) > 255:
-                            value_type_frequencies[index]["long_text"] += 1
-                        else:
-                            value_type_frequencies[index]["text"] += 1
-
-                new_row.append(value)
+            new_row = list(row)

            # Fill incomplete rows with empty values
            for i in range(len(new_row), largest_column_count):
@ -386,38 +360,8 @@ class TableHandler:
            normalized_data.append(new_row)

        field_with_type = []
-        tolerated_error_threshold = (
-            len(normalized_data)
-            / 100
-            * settings.BASEROW_IMPORT_TOLERATED_TYPE_ERROR_THRESHOLD
-        )
-        # Try to guess field type from type frequency
        for index, field_name in enumerate(fields):
-            frequencies = value_type_frequencies[index]
-            if frequencies["long_text"] > tolerated_error_threshold:
-                field_with_type.append(
-                    (field_name, "long_text", {"field_options": {"width": 400}})
-                )
-            elif frequencies["text"] > tolerated_error_threshold:
-                field_with_type.append((field_name, "text", {}))
-            elif frequencies["number_float"] > tolerated_error_threshold:
-                field_with_type.append(
-                    (
-                        field_name,
-                        "number",
-                        {"number_negative": True, "number_decimal_places": 10},
-                    )
-                )
-            elif frequencies["number_int"] > tolerated_error_threshold:
-                field_with_type.append(
-                    (
-                        field_name,
-                        "number",
-                        {"number_negative": True, "field_options": {"width": 150}},
-                    )
-                )
-            else:
-                field_with_type.append((field_name, "text", {}))
+            field_with_type.append((field_name, "text", {}))

        return field_with_type, normalized_data

--- a/backend/tests/baserow/contrib/database/field/dependencies/test_field_dependency_handler.py
+++ b/backend/tests/baserow/contrib/database/field/dependencies/test_field_dependency_handler.py
@ -1,5 +1,6 @@
 import pytest
 from django.conf import settings
+from pytest_unordered import unordered

 from baserow.contrib.database.fields.dependencies.circular_reference_checker import (
    will_cause_circular_dep,
@ -294,7 +295,7 @@ def test_get_dependant_fields_with_type(data_fixture):
        (text_field_1_dependency_2, text_field_type, None),
        (text_field_1_and_2_dependency_1, text_field_type, None),
    ]
-    assert results == expected_text_field_1_dependants
+    assert results == unordered(expected_text_field_1_dependants)

    results = FieldDependencyHandler.get_dependant_fields_with_type(
        table.id,
@ -312,7 +313,7 @@ def test_get_dependant_fields_with_type(data_fixture):
        ),
        (text_field_1_and_2_dependency_1, text_field_type, None),
    ]
-    assert results == expected_text_field_2_dependants
+    assert results == unordered(expected_text_field_2_dependants)

    results = FieldDependencyHandler.get_dependant_fields_with_type(
        table.id,
@ -328,6 +329,6 @@ def test_get_dependant_fields_with_type(data_fixture):
        associated_relations_changed=True,
        field_cache=field_cache,
    )
-    assert (
-        results == expected_text_field_1_dependants + expected_text_field_2_dependants
+    assert results == unordered(
+        expected_text_field_1_dependants + expected_text_field_2_dependants
    )
--- a/backend/tests/baserow/contrib/database/file_import/test_file_import_tasks.py
+++ b/backend/tests/baserow/contrib/database/file_import/test_file_import_tasks.py
@ -1,39 +1,37 @@
 import pytest
-from pyinstrument import Profiler
-from baserow.core.exceptions import UserNotInGroup
-from freezegun import freeze_time
-from decimal import Decimal
-
-from django.utils import timezone
 from django.conf import settings
 from django.test.utils import override_settings
+from django.utils import timezone
+from freezegun import freeze_time
+from pyinstrument import Profiler

-from baserow.contrib.database.fields.models import SelectOption
-from baserow.contrib.database.fields.field_cache import FieldCache
 from baserow.contrib.database.fields.dependencies.handler import FieldDependencyHandler
-from baserow.contrib.database.fields.models import TextField, LongTextField, NumberField
-from baserow.core.jobs.tasks import run_async_job, clean_up_jobs
-from baserow.core.jobs.models import Job
-from baserow.core.jobs.constants import (
-    JOB_FAILED,
-    JOB_FINISHED,
-    JOB_STARTED,
-    JOB_PENDING,
-)
-from baserow.contrib.database.file_import.exceptions import (
-    FileImportMaxErrorCountExceeded,
-)
 from baserow.contrib.database.fields.exceptions import (
    MaxFieldLimitExceeded,
    MaxFieldNameLengthExceeded,
    ReservedBaserowFieldNameException,
    InvalidBaserowFieldName,
 )
+from baserow.contrib.database.fields.field_cache import FieldCache
+from baserow.contrib.database.fields.models import SelectOption
+from baserow.contrib.database.fields.models import TextField
+from baserow.contrib.database.file_import.exceptions import (
+    FileImportMaxErrorCountExceeded,
+)
 from baserow.contrib.database.table.exceptions import (
    InvalidInitialTableData,
    InitialTableDataLimitExceeded,
    InitialTableDataDuplicateName,
 )
+from baserow.core.exceptions import UserNotInGroup
+from baserow.core.jobs.constants import (
+    JOB_FAILED,
+    JOB_FINISHED,
+    JOB_STARTED,
+    JOB_PENDING,
+)
+from baserow.core.jobs.models import Job
+from baserow.core.jobs.tasks import run_async_job, clean_up_jobs


@pytest.mark.django_db(transaction=True)
@ -225,14 +223,12 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
    field1, field2, field3, field4, field5 = fields

    assert isinstance(field1.specific, TextField)
-    assert isinstance(field2.specific, NumberField)
-    assert field2.specific.number_decimal_places == 10
-    assert isinstance(field3.specific, NumberField)
-    assert field3.specific.number_decimal_places == 10
-    assert isinstance(field4.specific, LongTextField)
-    assert isinstance(field5.specific, LongTextField)
+    assert isinstance(field2.specific, TextField)
+    assert isinstance(field3.specific, TextField)
+    assert isinstance(field4.specific, TextField)
+    assert isinstance(field5.specific, TextField)

-    assert getattr(rows[0], field3.db_column) == Decimal("1.55")
+    assert getattr(rows[0], field3.db_column) == "1.55"

    # Test type guessing with error threshold
    with override_settings(
@ -246,7 +242,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
    table = job.table
    model = table.get_model()
    rows = model.objects.all()
-    assert len(rows) == 19
+    assert len(rows) == 20

    fields = table.field_set.all()

@ -255,11 +251,9 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
    field1, field2, field3, field4, field5 = fields

    assert isinstance(field1.specific, TextField)
-    assert isinstance(field2.specific, NumberField)
-    assert field2.specific.number_decimal_places == 0
-    assert isinstance(field3.specific, NumberField)
-    assert field3.specific.number_decimal_places == 10
-    assert isinstance(field4.specific, LongTextField)
+    assert isinstance(field2.specific, TextField)
+    assert isinstance(field3.specific, TextField)
+    assert isinstance(field4.specific, TextField)
    assert isinstance(field5.specific, TextField)

    # Import data to an existing table
@ -272,7 +266,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
        run_async_job(job.id)

    rows = model.objects.all()
-    assert len(rows) == 21
+    assert len(rows) == 22

    # Import data with error
    data = [
@ -291,27 +285,9 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):
    job.refresh_from_db()

    rows = model.objects.all()
-    assert len(rows) == 23
+    assert len(rows) == 26

-    assert sorted(job.report["failing_rows"].keys()) == ["0", "3"]
-
-    assert job.report["failing_rows"]["0"] == {
-        f"field_{field2.id}": [
-            {"code": "invalid", "error": "A valid number is required."}
-        ],
-        f"field_{field3.id}": [
-            {"code": "invalid", "error": "A valid number is required."}
-        ],
-    }
-
-    assert job.report["failing_rows"]["3"] == {
-        f"field_{field2.id}": [
-            {
-                "code": "max_decimal_places",
-                "error": "Ensure that there are no more than 0 decimal places.",
-            }
-        ]
-    }
+    assert sorted(job.report["failing_rows"].keys()) == []

    # Change user language to test message i18n
    job.user.profile.language = "fr"
@ -327,12 +303,7 @@ def test_run_file_import_task(data_fixture, patch_filefield_storage):

    job.refresh_from_db()

-    assert job.report["failing_rows"]["0"][f"field_{field2.id}"] == [
-        {
-            "code": "invalid",
-            "error": "Un nombre valide est requis.",
-        }
-    ]
+    assert job.report["failing_rows"] == {}

    data = [
        [1.3],
--- a/changelog.md
+++ b/changelog.md
@ -13,6 +13,8 @@ For example:

 ### Bug Fixes

+* Disable table import field type guessing and instead always import as text fields. [#1050](https://gitlab.com/bramw/baserow/-/issues/1050)
+
 ### Breaking Changes


--- a/web-frontend/modules/database/components/table/TableJSONImporter.vue
+++ b/web-frontend/modules/database/components/table/TableJSONImporter.vue
@ -224,7 +224,7 @@ export default {

        header.forEach((key) => {
          const exists = Object.prototype.hasOwnProperty.call(entry, key)
-          const value = exists ? entry[key].toString() : ''
+          const value = exists ? JSON.stringify(entry[key]) : ''
          row.push(value)
        })