Skip to content

Commit

Permalink
fix: use defusedxml for xml parsing, only parse one in proj creation (#…
Browse files Browse the repository at this point in the history
…1316)

* build: add defusedxml to prevent injection attacks on xml upload

* fix(backend): only load the xlsform once, then pass to task creation

* test: fix project creation test to match xform update

* fix(frontend): update error message parsing if form is not valid
  • Loading branch information
spwoodcock authored Mar 2, 2024
1 parent 12ab56a commit 3d26f8d
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 126 deletions.
191 changes: 82 additions & 109 deletions src/backend/app/central/central_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@
from asyncio import gather
from io import BytesIO
from pathlib import Path
from typing import Optional
from xml.etree import ElementTree
from typing import Optional, Union

from defusedxml import ElementTree
from fastapi import HTTPException
from fastapi.responses import JSONResponse
from loguru import logger as log
from osm_fieldwork.CSVDump import CSVDump
from osm_fieldwork.OdkCentral import OdkAppUser, OdkForm, OdkProject
Expand Down Expand Up @@ -391,9 +390,11 @@ async def update_and_publish_form(
odk_credentials (project_schemas.ODKCentralDecrypted): ODK Central creds.
"""
odk_form_name = f"{form_name_prefix}_{task_id}"
updated_xform_data = update_xform_info(
xform_data = await read_and_test_xform(
xform_data, form_file_ext, return_form_data=True
)
updated_xform_data = await update_xform_info(
xform_data,
form_file_ext,
odk_form_name,
f"{odk_form_name}.geojson",
)
Expand Down Expand Up @@ -431,53 +432,101 @@ def download_submissions(
return fixed.splitlines()


async def test_form_validity(xform_content: bytes, form_file_ext: str):
"""Validate an XForm.
async def read_and_test_xform(
input_data: BytesIO,
form_file_ext: str,
return_form_data: bool = False,
) -> Union[BytesIO, dict]:
"""Read and validate an XForm.
Args:
xform_content (str): form to be tested
input_data (BytesIO): form to be tested.
form_file_ext (str): type of form (.xls, .xlsx, or .xml).
return_form_data (bool): return the XForm data.
"""
try:
if form_file_ext == ".xml":
# Write xform_content to a temporary file
with open(f"/tmp/xform_temp{form_file_ext}", "wb") as f:
f.write(xform_content)
else:
with open(f"/tmp/xlsform{form_file_ext}", "wb") as f:
f.write(xform_content)
# Convert XLSForm to XForm
# TODO xls2xform_convert requires files on disk
# TODO create PR to accept BytesIO?

# Read from BytesIO object
input_content = input_data.getvalue()
file_ext = form_file_ext.lower()

input_path = Path(f"/tmp/fmtm_form_input_tmp{file_ext}")
# This file will store xml contents of an xls form
# NOTE a file on disk is required by xls2xform_convert
output_path = Path("/tmp/fmtm_xform_temp.xml")

if file_ext == ".xml":
# Create output file to write to
output_path.touch(exist_ok=True)
# Write input_content to a temporary file
with open(output_path, "wb") as f:
f.write(input_content)
else:
# Create input file to write to
input_path.touch(exist_ok=True)
with open(input_path, "wb") as f:
f.write(input_content)
try:
log.debug(f"Converting xlsform -> xform: {str(output_path)}")
# FIXME should this be validate=True?
xls2xform_convert(
xlsform_path="/tmp/xlsform.xls",
xform_path="/tmp/xform_temp.xml",
xlsform_path=str(input_path),
xform_path=str(output_path),
validate=False,
)
except Exception as e:
log.error(e)
msg = f"XLSForm is invalid. Possible reason: {str(e)}"
raise HTTPException(
status_code=HTTPStatus.UNPROCESSABLE_ENTITY, detail=msg
) from e

# Parse XForm
namespaces = {"xforms": "http://www.w3.org/2002/xforms"}
tree = ElementTree.parse("/tmp/xform_temp.xml")
root = tree.getroot()
# Parse XForm
try:
# TODO for memory object use ElementTree.fromstring()
xml_parsed = ElementTree.parse(str(output_path))
if return_form_data:
xml_bytes = ElementTree.tostring(xml_parsed.getroot())
return BytesIO(xml_bytes)
except ElementTree.ParseError as e:
log.error(e)
msg = f"Error parsing XForm XML: Possible reason: {str(e)}"
raise HTTPException(
status_code=HTTPStatus.UNPROCESSABLE_ENTITY, detail=msg
) from e

# Delete temp files
if file_ext != ".xml":
input_path.unlink()
output_path.unlink()

# Extract geojson filenames
if return_form_data:
return xml_parsed

# Extract geojson filenames
try:
root = xml_parsed.getroot()
namespaces = {"xforms": "http://www.w3.org/2002/xforms"}
geojson_list = [
os.path.splitext(inst.attrib["src"].split("/")[-1])[0]
for inst in root.findall(".//xforms:instance[@src]", namespaces)
if inst.attrib.get("src", "").endswith(".geojson")
]

return {"required media": geojson_list, "message": "Your form is valid"}
return {"required_media": geojson_list, "message": "Your form is valid"}

except Exception as e:
return JSONResponse(
content={"message": "Your form is invalid", "possible_reason": str(e)},
status_code=400,
)
log.error(e)
msg = f"Error extracting geojson filename: Possible reason: {str(e)}"
raise HTTPException(
status_code=HTTPStatus.UNPROCESSABLE_ENTITY, detail=msg
) from e


def update_xform_info(
async def update_xform_info(
form_data: BytesIO,
form_file_ext: str,
form_name,
form_name: str,
geojson_file_name: str,
) -> BytesIO:
"""Update fields in the XForm to work with FMTM.
Expand All @@ -494,82 +543,6 @@ def update_xform_info(
Returns:
BytesIO: The XForm data.
"""
# TODO xls2xform_convert requires files on disk
# TODO create PR to accept BytesIO?
form_path = Path(f"/tmp/fmtm_form_input_tmp{form_file_ext}")
with open(form_path, "wb") as f:
f.write(form_data.getvalue())

form_file_extension = form_path.suffix.lower()
# This file will store xml contents of an xls form
# NOTE a file on disk is required by xls2xform_convert
xform_path = Path("/tmp/fmtm_xform_tmp.xml")

if form_file_extension != ".xml":
try:
log.debug(f"Converting xlsform -> xform: {str(form_path)}")
xls2xform_convert(
xlsform_path=str(form_path), xform_path=str(xform_path), validate=False
)
except Exception as e:
log.error(e)
msg = f"Couldn't convert {str(form_path)} to an XForm!"
log.error(msg)
raise HTTPException(status_code=400, detail=msg) from e

if xform_path.stat().st_size <= 0:
log.warning(f"{str(xform_path)} is empty!")
raise HTTPException(
status_code=400, detail=f"{str(xform_path)} is empty!"
) from None

with open(xform_path, "r") as xform:
data = xform.read()
# Delete temp from output
xform_path.unlink()
else:
with open(form_path, "r") as xlsform:
log.debug(f"Reading XForm directly: {str(form_path)}")
data = xlsform.read()

# Delete temp form input
form_path.unlink()

# # Parse the XML to geojson
# xml = xmltodict.parse(str(data))

# # First change the osm data extract file
# index = 0
# for inst in xml["h:html"]["h:head"]["model"]["instance"]:
# try:
# if "@src" in inst:
# if (
# xml["h:html"]["h:head"]["model"]["instance"][index] \
# ["@src"].split(
# "."
# )[1]
# == "geojson"
# ):
# xml["h:html"]["h:head"]["model"]["instance"][index][
# "@src"
# ] = extract

# if "data" in inst:
# print("data in inst")
# if "data" == inst:
# print("Data = inst ", inst)
# xml["h:html"]["h:head"]["model"]["instance"]["data"] \
# ["@id"] = id
# # xml["h:html"]["h:head"]["model"]["instance"]["data"] \
# # ["@id"] = xform
# else:
# xml["h:html"]["h:head"]["model"]["instance"][0]["data"] \
# ["@id"] = id
# except Exception:
# continue
# index += 1
# xml["h:html"]["h:head"]["h:title"] = name

log.debug("Updating XML keys in XForm with data extract file & form id")

# Namespaces definition
Expand All @@ -580,7 +553,7 @@ def update_xform_info(
}

# Parse the XML
root = ElementTree.fromstring(data)
root = ElementTree.fromstring(form_data.getvalue())

# Update id attribute to equal the form name to be generated
xform_data = root.findall(".//xforms:data[@id]", namespaces)
Expand Down
18 changes: 10 additions & 8 deletions src/backend/app/projects/project_crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -1107,8 +1107,7 @@ def generate_task_files(
project: db_models.DbProject,
task_id: int,
data_extract: FeatureCollection,
xlsform_data: BytesIO,
form_file_ext: str,
xform_data: BytesIO,
odk_credentials: project_schemas.ODKCentralDecrypted,
):
"""Generate all files for a task."""
Expand Down Expand Up @@ -1157,9 +1156,9 @@ def generate_task_files(
project_log.info(f"Generating xform from for task: ({task_id})")

# This is where the ODK form name and geojson media name are set
xform_data = central_crud.update_xform_info(
xlsform_data,
form_file_ext,
update_xform_sync = async_to_sync(central_crud.update_xform_info)
updated_xform = update_xform_sync(
xform_data,
form_name,
f"{form_name}.geojson",
)
Expand All @@ -1168,7 +1167,7 @@ def generate_task_files(
project_log.info(f"Uploading data extract media to task ({task_id})")
xform_name = central_crud.create_odk_xform(
odk_id,
xform_data,
updated_xform,
f"{form_name}.geojson",
geojson_data,
odk_credentials,
Expand Down Expand Up @@ -1283,6 +1282,10 @@ def generate_project_files(
detail="Failed splitting extract by tasks.",
)

# Convert XLSForm --> XForm for all tasks
read_xform_sync = async_to_sync(central_crud.read_and_test_xform)
xform_data = read_xform_sync(xlsform, form_file_ext, return_form_data=True)

# Run with expensive task via threadpool
def wrap_generate_task_files(task_id):
"""Func to wrap and return errors from thread.
Expand All @@ -1297,8 +1300,7 @@ def wrap_generate_task_files(task_id):
project,
task_id,
task_extract_dict[task_id],
xlsform,
form_file_ext,
xform_data,
odk_credentials,
)
except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion src/backend/app/projects/project_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,7 +550,7 @@ async def validate_form(form: UploadFile):
)

contents = await form.read()
return await central_crud.test_form_validity(contents, file_ext)
return await central_crud.read_and_test_xform(BytesIO(contents), file_ext)


@router.post("/{project_id}/generate-project-data")
Expand Down
2 changes: 1 addition & 1 deletion src/backend/pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies = [
"asgiref==3.7.2",
"sozipfile==0.3.2",
"cryptography>=42.0.1",
"defusedxml>=0.7.1",
"osm-login-python==1.0.1",
"osm-fieldwork==0.5.1",
"osm-rawdata==0.2.3",
Expand Down
10 changes: 7 additions & 3 deletions src/backend/tests/test_projects_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
from loguru import logger as log
from shapely import Polygon

from app.central.central_crud import create_odk_project
from app.central.central_crud import create_odk_project, read_and_test_xform
from app.config import encrypt_value, settings
from app.db import db_models
from app.db.postgis_utils import split_geojson_by_task_areas
Expand Down Expand Up @@ -275,6 +275,11 @@ async def test_generate_project_files(db, client, project):
with open(xlsform_file, "rb") as xlsform_data:
xlsform_obj = BytesIO(xlsform_data.read())

# Convert XLSForm --> XForm for all tasks
xform_data = await read_and_test_xform(
xlsform_obj, xlsform_file.suffix.lower(), return_form_data=True
)

for task_id in split_extract_dict.keys():
# NOTE avoid the lambda function for run_in_threadpool
# functools.partial captures the loop variable task_id in a
Expand All @@ -286,8 +291,7 @@ async def test_generate_project_files(db, client, project):
project,
task_id,
split_extract_dict[task_id],
xlsform_obj,
xlsform_file.suffix.lower(),
xform_data,
odk_credentials,
)
)
Expand Down
6 changes: 2 additions & 4 deletions src/frontend/src/api/CreateProjectService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -462,11 +462,9 @@ const ValidateCustomForm: Function = (url: string, formUpload: any) => {
dispatch(
CommonActions.SetSnackBar({
open: true,
message:
JSON.stringify(`${error.response.data.message}, ${error.response.data.possible_reason}`) ||
'Something Went Wrong',
message: JSON.stringify(error.response.data.detail) || 'Something Went Wrong',
variant: 'error',
duration: 2000,
duration: 5000,
}),
);
dispatch(CreateProjectActions.ValidateCustomFormLoading(false));
Expand Down

0 comments on commit 3d26f8d

Please sign in to comment.