Sharepoint reader

Module to define the behaviour to read from Sharepoint.

`SharepointCsvReader` ¶

Bases: SharepointReader

Read CSV files from Sharepoint and return Spark DataFrame.

Supports reading a single file or combining multiple files in a folder. Ensures schema consistency and archives processed files.

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

class SharepointCsvReader(SharepointReader):
    """Read CSV files from Sharepoint and return Spark DataFrame.

    Supports reading a single file or combining multiple files in a folder.
    Ensures schema consistency and archives processed files.
    """

    def read(self, file_path: str = None, pattern: str = None) -> DataFrame:
        """Read CSV data from Sharepoint.

        A path whose last component contains a dot is treated as a single file;
        any other path is treated as a folder whose CSV files are combined.

        Args:
            file_path: Full file or folder path (overrides options if provided).
            pattern: Optional glob-style filename pattern (e.g. 'sales_*.csv')
                applied in folder mode.

        Returns:
            Spark DataFrame.

        Raises:
            ValueError: Invalid/missing path or path not found.
        """
        file_path = file_path or self.file_path
        pattern = pattern or self.pattern

        if not file_path:
            raise ValueError(
                """`file_name` or `folder_relative_path` must be provided via
                sharepoint_opts."""
            )

        # Case 1: file_path includes a file (e.g., folder/file.csv or full path)
        if "." in Path(file_path).name:
            sp_file = self.sharepoint_utils.get_file_metadata(file_path)
            _LOGGER.info(f"Detected single-file read mode for '{file_path}'.")
            return self._load_and_archive_file(sp_file)

        # Case 2: it's a folder — use optional pattern
        if not self.sharepoint_utils.check_if_endpoint_exists(file_path):
            raise ValueError(f"Folder '{file_path}' does not exist in Sharepoint.")

        _LOGGER.info(
            f"Detected folder read mode for '{file_path}' "
            + (
                f"with pattern '{pattern}'."
                if pattern
                else "with no pattern (all files)."
            )
        )
        return self.read_csv_folder(file_path, pattern)

    def _load_and_archive_file(self, sp_file: SharepointFile) -> DataFrame:
        """Download a Sharepoint CSV, stage it locally, load with Spark, and archive it.

        Handles:
        - Writing the CSV to a temporary local path.
        - Reading it as a Spark DataFrame.
        - Archiving goes to the configured success/error subfolders when enabled
        (defaults: "done"/"error").

        The archive call runs in a ``finally`` block, so the file is archived to
        the error subfolder whenever the read does not complete.

        Args:
            sp_file: File metadata and content.

        Returns:
            Spark DataFrame.

        Raises:
            ValueError: Empty content.
            Exception: Staging or read failure.
        """
        # The archive subfolders live next to the file: use the parent folder when
        # the configured path points at a file, the path itself when it is a folder.
        if self.file_path:
            base_folder = (
                str(Path(self.file_path).parent)
                if "." in Path(self.file_path).name
                else str(Path(self.file_path))
            )
        else:
            base_folder = sp_file._folder if getattr(sp_file, "_folder", None) else None

        success_subfolder = self.opts.archive_success_subfolder or "done"
        error_subfolder = self.opts.archive_error_subfolder or "error"

        success_folder = f"{base_folder}/{success_subfolder}" if base_folder else None
        error_folder = f"{base_folder}/{error_subfolder}" if base_folder else None

        archive_target = error_folder  # default to error unless full read succeeds

        try:
            # IMPORTANT: empty check inside try so finally always runs
            if not sp_file.content:
                raise ValueError(
                    f"File '{getattr(sp_file, 'file_path', None) or self.file_path}' "
                    "is empty or could not be downloaded."
                )

            # _load_csv_to_spark already caches and materializes the DataFrame,
            # so nothing that can still fail follows once it returns and the
            # success target is safe to select.
            sp_file, df = self._load_csv_to_spark(sp_file)
            archive_target = success_folder  # only mark success after full read

            _LOGGER.info(
                f"Successfully read '{sp_file.file_path}' into Spark DataFrame."
            )
            return df

        except Exception as e:
            _LOGGER.error(f"Error processing '{sp_file.file_name}': {e}")
            raise

        finally:
            self.sharepoint_utils.archive_sharepoint_file(
                sp_file=sp_file,
                to_path=archive_target,
                move_enabled=self.opts.archive_enabled,
            )

    def _get_csv_files_in_folder(
        self, folder_path: str, pattern: str = None
    ) -> list[SharepointFile]:
        """List CSV files in a Sharepoint folder, optionally filtered by pattern.

        Args:
            folder_path: Sharepoint folder path.
            pattern: Optional glob-style pattern matched against the file name
                with ``fnmatch`` (e.g. '*.csv', 'sales_*.csv').

        Returns:
            List of SharepointFile, sorted by file name.
        """
        items = self.sharepoint_utils.list_items_in_path(folder_path)
        files = []

        if pattern:
            _LOGGER.info(
                f"""Filtering Sharepoint files in '{folder_path}' using glob-style
                pattern: '{pattern}'.
                Ensure your pattern uses wildcards (e.g., '*.csv', 'sales_*.csv').
                """
            )

        for item in items:
            file = SharepointFile(
                file_name=item["name"],
                time_created=item.get("createdDateTime", ""),
                time_modified=item.get("lastModifiedDateTime", ""),
                _folder=folder_path,
            )

            if not file.is_csv:
                continue

            if pattern and not fnmatch.fnmatch(file.file_name, pattern):
                continue

            files.append(file)

        return sorted(files, key=lambda f: f.file_name)

    def _load_csv_to_spark(
        self, sp_file: SharepointFile
    ) -> tuple[SharepointFile, DataFrame]:
        """Load a staged CSV into Spark and return file + DataFrame.

        The file metadata is fetched again from its path, the file is saved to the
        staging area, and the staged copy is read with the resolved CSV options.
        The returned DataFrame is cached and already materialized.

        Args:
            sp_file: Sharepoint file metadata.

        Returns:
            (SharepointFile, Spark DataFrame).

        Raises:
            ValueError: The staged copy could not be read by Spark.
        """
        sp_file = self.sharepoint_utils.get_file_metadata(sp_file.file_path)

        local_file = self.sharepoint_utils.save_to_staging_area(sp_file)

        spark_options = self.resolve_spark_csv_options(sp_file.content)

        try:
            _LOGGER.info(f"Starting to read file: {sp_file.file_name}")
            start_time = time.time()
            df = (
                ExecEnv.SESSION.read.format("csv")
                .options(**spark_options)
                .load(str(local_file))
                .cache()
            )
            # Force materialization before reporting the duration, so the logged
            # time covers the actual read and not only the lazy plan creation.
            df.count()
            _LOGGER.info(f"""Finished reading file: {sp_file.file_name} in
                {round(time.time() - start_time, 2)} seconds""")

            return sp_file, df

        except Exception as e:
            _LOGGER.error(
                f"Failed to read local copy of Sharepoint file: {local_file}",
                exc_info=True,
            )
            raise ValueError(
                f"Failed to read Sharepoint file: '{sp_file.file_path}'."
            ) from e

    def read_csv_folder(self, folder_path: str, pattern: str = None) -> DataFrame:
        """Read and combine CSVs from a Sharepoint folder.

        If a pattern is provided, only files whose names match that glob-style
        pattern will be read.
        Only archives files to the configured success subfolder if the full read
        and union succeeds.
        Files causing schema mismatches or other read issues are moved to the
        configured error subfolder (when enabled), and the error is re-raised.

        Args:
            folder_path: Sharepoint folder path.
            pattern: Optional glob-style filename pattern (e.g. 'sales_*.csv').

        Returns:
            Combined Spark DataFrame.

        Raises:
            ValueError: No valid files or schema mismatches.
        """
        files = self._get_csv_files_in_folder(folder_path, pattern)
        if not files:
            raise ValueError(f"No CSV files found in folder: {folder_path}")

        valid_files, dfs = [], []
        base_schema = None

        for file in files:
            try:
                file_with_content, df = self._validate_and_read_file(file, base_schema)
                # The first schema read becomes the reference for later files.
                base_schema = base_schema or df.schema
                dfs.append(df)
                valid_files.append(file_with_content)
            except Exception as e:
                self._handle_file_error(file, e)
                raise

        # Defensive guard: reduce() below cannot handle an empty list.
        if not dfs:
            raise ValueError("No valid CSV files could be loaded from folder.")

        combined = reduce(lambda a, b: a.unionByName(b), dfs).cache()
        combined.count()

        # NOTE(review): unlike the single-file path, no "done" default is applied
        # here when archive_success_subfolder is unset — confirm this is intended.
        for sp_file in valid_files:
            self.sharepoint_utils.archive_sharepoint_file(
                sp_file,
                to_path=(
                    f"{folder_path}/{self.opts.archive_success_subfolder}"
                    if self.opts.archive_success_subfolder
                    else None
                ),
                move_enabled=self.opts.archive_enabled,
            )

        return combined

    def _validate_and_read_file(
        self,
        file: SharepointFile,
        base_schema: Optional[StructType],
    ) -> tuple[SharepointFile, DataFrame]:
        """Validate schema and read CSV file into a Spark DataFrame.

        On a schema mismatch the file is archived to the error folder and marked
        as already archived before the error is raised.

        Args:
            file: Sharepoint file to read.
            base_schema: Schema to validate against.

        Returns:
            (validated SharepointFile, DataFrame).

        Raises:
            ValueError: Schema mismatch.
        """
        file_with_content, df = self._load_csv_to_spark(file)

        if base_schema and df.schema != base_schema:
            _LOGGER.error(
                f"""Schema mismatch in '{file.file_name}'. Expected: {base_schema},
                Found: {df.schema}"""
            )
            self.sharepoint_utils.archive_sharepoint_file(
                sp_file=file_with_content,
                to_path=self.error_folder,
                move_enabled=self.opts.archive_enabled,
            )
            # read_csv_folder passes `file` to _handle_file_error when the error
            # below propagates; flag it so the file is not archived twice.
            file._already_archived = True
            raise ValueError(f"Schema mismatch in '{file.file_name}'")

        return file_with_content, df

    def _handle_file_error(
        self,
        file: SharepointFile,
        error: Exception,
    ) -> None:
        """Handle file read or processing errors by logging and archiving.

        Logs the error, prevents duplicate archiving, and moves the file
        to the error subfolder when enabled. Falls back gracefully if
        archiving fails.

        Args:
            file: Problematic SharepointFile.
            error: Exception encountered.
        """
        _LOGGER.error(f"Error processing '{file.file_name}': {error}")
        if not getattr(file, "_already_archived", False):
            file.skip_rename = True
            try:
                self.sharepoint_utils.archive_sharepoint_file(
                    sp_file=file,
                    to_path=self.error_folder,
                    move_enabled=self.opts.archive_enabled,
                )
                file._already_archived = True
            except Exception as archive_error:
                # Best effort: the original error is what the caller re-raises.
                _LOGGER.warning(f"Secondary archiving failed: {archive_error}")
        else:
            _LOGGER.info(
                f"Skipping second archive for '{file.file_name}' (already archived)"
            )

    def detect_delimiter(
        self,
        file_content: bytes,
        provided_delimiter: Optional[str] = None,
        expected_columns: Optional[list] = None,
    ) -> str:
        """Detect the appropriate delimiter for a CSV file.

        If a delimiter is explicitly provided by the user, it will be used directly
        (sniffing is bypassed).
        Otherwise, attempts to auto-detect the delimiter with csv.Sniffer, limited
        to the candidates ';', ',', '|' and tab. The header line is then split on
        the detected delimiter only to warn when the column count looks wrong.
        If decoding or sniffing fails, a warning is logged and a comma is
        returned instead of raising.

        Args:
            file_content: Raw CSV bytes.
            provided_delimiter: Explicit delimiter to use.
            expected_columns: Optional expected header names; only their number
                is compared here.

        Returns:
            Final delimiter.
        """
        if provided_delimiter:
            _LOGGER.info(f"User-specified delimiter '{provided_delimiter}' selected.")
            return provided_delimiter

        try:
            # "utf-8-sig" decodes plain UTF-8 as well and drops a leading
            # byte-order mark, matching the header check in
            # resolve_spark_csv_options.
            text = file_content.decode("utf-8-sig")
            dialect = csv.Sniffer().sniff(text, delimiters=";,|\t")
            detected_delimiter = dialect.delimiter

            _LOGGER.info(
                f"No user-specified delimiter. Auto-detected: '{detected_delimiter}'"
            )

            first_line = text.splitlines()[0].strip()
            actual_column_count = len(first_line.split(detected_delimiter))

            if expected_columns:
                expected_count = len(expected_columns)
                if actual_column_count != expected_count:
                    _LOGGER.warning(
                        f"""Detected delimiter '{detected_delimiter}' resulted in
                        {actual_column_count} columns,
                        but {expected_count} were expected. Consider specifying
                        the delimiter explicitly."""
                    )
            elif actual_column_count <= 1:
                _LOGGER.warning(
                    f"""Detected delimiter '{detected_delimiter}' resulted in only
                    {actual_column_count} column.
                     Consider specifying the delimiter explicitly in
                     'sharepoint_opts.local_options'."""
                )

            return detected_delimiter

        except Exception as e:
            _LOGGER.warning(
                f"Failed to auto-detect delimiter. Defaulting to comma. Reason: {e}"
            )
            return ","

    def resolve_spark_csv_options(self, file_content: bytes) -> dict:
        """Resolve Spark CSV read options by validating or detecting delimiter.

        A user-supplied "sep" takes precedence over "delimiter"; without either,
        the delimiter is detected from the content. When "expected_columns" is
        configured, the header is compared against it and a mismatch only
        produces a warning.

        Args:
            file_content: Raw file bytes.

        Returns:
            Dict of Spark CSV options: a copy of the local options with "sep"
            set to the final delimiter and "delimiter" removed.
        """
        local_options = self._input_spec.sharepoint_opts.local_options or {}

        if "sep" in local_options:
            user_delimiter = local_options["sep"]
        elif "delimiter" in local_options:
            user_delimiter = local_options["delimiter"]
        else:
            user_delimiter = None

        expected_columns = local_options.get("expected_columns")

        # Apply the comma fallback right away so the header check below never
        # splits on an empty separator.
        final_delimiter = (
            self.detect_delimiter(
                file_content=file_content,
                provided_delimiter=user_delimiter,
                expected_columns=expected_columns,
            )
            or ","
        )

        # Warn if expected column names do not match the header when using the selected
        # delimiter
        if expected_columns:
            try:
                # "utf-8-sig" drops a leading byte-order mark, which would
                # otherwise stick to the first column name and cause a false
                # mismatch warning.
                header_line = file_content.decode("utf-8-sig").splitlines()[0].strip()
                actual_columns = [c.strip() for c in header_line.split(final_delimiter)]

                expected_normalized = [str(c).strip().lower() for c in expected_columns]
                actual_normalized = [c.lower() for c in actual_columns]

                if actual_normalized != expected_normalized:
                    _LOGGER.warning(
                        "Expected columns don't match CSV header using delimiter '%s'. "
                        "Expected: %s vs. Actual: %s. The read will proceed; "
                        "consider specifying the correct delimiter or "
                        "updating expected_columns.",
                        final_delimiter,
                        expected_columns,
                        actual_columns,
                    )
            except Exception as e:
                _LOGGER.warning(
                    "Failed to validate expected_columns against CSV header. "
                    "The read will proceed. Reason: %s",
                    e,
                )

        spark_options = dict(local_options)
        spark_options["sep"] = final_delimiter
        # Remove "delimiter" to avoid ambiguity as spark uses "sep"
        spark_options.pop("delimiter", None)

        return spark_options

`detect_delimiter(file_content, provided_delimiter=None, expected_columns=None)` ¶

Detect the appropriate delimiter for a CSV file.

If a delimiter is explicitly provided by the user, it will be used directly (sniffing is bypassed). Otherwise, attempts to auto-detect the delimiter using csv.Sniffer based on the first line or expected columns.

Parameters:

Name	Type	Description	Default
`file_content`	`bytes`	Raw CSV bytes.	required
`provided_delimiter`	`Optional[str]`	Explicit delimiter to use.	`None`
`expected_columns`	`Optional[list]`	Optional expected header names.	`None`

Returns:

Type	Description
`str`	Final delimiter.

Raises:

Type	Description
`ValueError`	Unable to determine delimiter.

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

def detect_delimiter(
    self,
    file_content: bytes,
    provided_delimiter: Optional[str] = None,
    expected_columns: Optional[list] = None,
) -> str:
    """Pick the delimiter used to parse a CSV payload.

    A truthy ``provided_delimiter`` is returned as-is and no sniffing happens.
    Otherwise the content is decoded as UTF-8 and handed to ``csv.Sniffer``,
    restricted to the candidates ``;``, ``,``, ``|`` and tab. The header line is
    then split on the sniffed delimiter purely to warn when the column count
    looks wrong; the sniffed delimiter is returned either way.

    If decoding or sniffing fails, a warning is logged and a comma is returned
    instead of raising.

    Args:
        file_content: Raw CSV bytes.
        provided_delimiter: Explicit delimiter to use.
        expected_columns: Optional expected header names; only their number is
            compared here.

    Returns:
        The delimiter to use.
    """
    if provided_delimiter:
        _LOGGER.info(f"User-specified delimiter '{provided_delimiter}' selected.")
        return provided_delimiter

    try:
        decoded = file_content.decode("utf-8")
        sniffed = csv.Sniffer().sniff(decoded, delimiters=";,|\t").delimiter

        _LOGGER.info(f"No user-specified delimiter. Auto-detected: '{sniffed}'")

        # Only the header line is inspected for the sanity warnings below.
        header = decoded.splitlines()[0].strip()
        column_total = len(header.split(sniffed))

        if expected_columns:
            wanted_total = len(expected_columns)
            if column_total != wanted_total:
                _LOGGER.warning(
                    f"""Detected delimiter '{sniffed}' resulted in
                    {column_total} columns,
                    but {wanted_total} were expected. Consider specifying
                    the delimiter explicitly."""
                )
        elif column_total <= 1:
            _LOGGER.warning(
                f"""Detected delimiter '{sniffed}' resulted in only
                {column_total} column.
                 Consider specifying the delimiter explicitly in
                 'sharepoint_opts.local_options'."""
            )

        return sniffed

    except Exception as err:
        _LOGGER.warning(
            f"Failed to auto-detect delimiter. Defaulting to comma. Reason: {err}"
        )
        return ","

`read(file_path=None, pattern=None)` ¶

Read CSV data from Sharepoint.

Parameters:

Name	Type	Description	Default
`file_path`	`str`	Full file or folder path (overrides options if provided).	`None`
`pattern`	`str`	Optional glob-style filename pattern (e.g. `sales_*.csv`) used in folder mode.	`None`

Returns:

Type	Description
`DataFrame`	Spark DataFrame.

Raises:

Type	Description
`ValueError`	Invalid/missing path or path not found.

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

def read(self, file_path: str = None, pattern: str = None) -> DataFrame:
    """Load CSV data from Sharepoint as a Spark DataFrame.

    Explicit arguments win over the reader's configured ``file_path`` and
    ``pattern``. When the last path component contains a dot, the path is
    handled as a single file; otherwise it is handled as a folder, which must
    exist in Sharepoint.

    Args:
        file_path: Full file or folder path (overrides options if provided).
        pattern: Optional glob-style filename pattern forwarded to
            ``read_csv_folder`` in folder mode.

    Returns:
        Spark DataFrame.

    Raises:
        ValueError: No path was supplied, or the folder does not exist.
    """
    target = file_path or self.file_path
    name_filter = pattern or self.pattern

    if not target:
        raise ValueError(
            """`file_name` or `folder_relative_path` must be provided via
            sharepoint_opts."""
        )

    # Single-file mode: the last path component looks like "name.ext".
    is_single_file = "." in Path(target).name
    if is_single_file:
        sp_file = self.sharepoint_utils.get_file_metadata(target)
        _LOGGER.info(f"Detected single-file read mode for '{target}'.")
        return self._load_and_archive_file(sp_file)

    # Folder mode: the folder has to exist before its files are listed.
    folder_exists = self.sharepoint_utils.check_if_endpoint_exists(target)
    if not folder_exists:
        raise ValueError(f"Folder '{target}' does not exist in Sharepoint.")

    if name_filter:
        mode_suffix = f"with pattern '{name_filter}'."
    else:
        mode_suffix = "with no pattern (all files)."
    _LOGGER.info(f"Detected folder read mode for '{target}' " + mode_suffix)

    return self.read_csv_folder(target, name_filter)

`read_csv_folder(folder_path, pattern=None)` ¶

Read and combine CSVs from a Sharepoint folder.

If a pattern is provided, only files whose names match that glob-style pattern (e.g. `sales_*.csv`) will be read. Files are archived to the configured success subfolder only if the full read and union succeed. Files causing schema mismatches or other read issues are moved to the configured error subfolder (when enabled).

Parameters:

Name	Type	Description	Default
`folder_path`	`str`	Sharepoint folder path.	required
`pattern`	`str`	Optional glob-style pattern (e.g. `sales_*.csv`) matched against filenames.	`None`

Returns:

Type	Description
`DataFrame`	Combined Spark DataFrame.

Raises:

Type	Description
`ValueError`	No valid files or schema mismatches.

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

def read_csv_folder(self, folder_path: str, pattern: str = None) -> DataFrame:
    """Combine the CSV files of a Sharepoint folder into one DataFrame.

    Files are listed through ``_get_csv_files_in_folder`` (optionally narrowed
    by ``pattern``) and read one by one; the first schema read becomes the
    reference handed to the validation of the following files. Any failure is
    passed to ``_handle_file_error`` and re-raised, so files are archived to the
    success subfolder only after every file was read and the union was
    materialized.

    Args:
        folder_path: Sharepoint folder path.
        pattern: Optional glob-style filename pattern (e.g. ``sales_*.csv``).

    Returns:
        Combined Spark DataFrame.

    Raises:
        ValueError: No CSV files were found or none could be loaded. Errors
            raised while reading or validating a file are re-raised as well.
    """
    candidates = self._get_csv_files_in_folder(folder_path, pattern)
    if not candidates:
        raise ValueError(f"No CSV files found in folder: {folder_path}")

    loaded_files = []
    frames = []
    reference_schema = None

    for candidate in candidates:
        try:
            loaded, frame = self._validate_and_read_file(candidate, reference_schema)
            if not reference_schema:
                reference_schema = frame.schema
            frames.append(frame)
            loaded_files.append(loaded)
        except Exception as e:
            self._handle_file_error(candidate, e)
            raise

    if not frames:
        raise ValueError("No valid CSV files could be loaded from folder.")

    # Fold the frames together by column name, then cache and materialize.
    combined = frames[0]
    for extra in frames[1:]:
        combined = combined.unionByName(extra)
    combined = combined.cache()
    combined.count()

    # Without a configured success subfolder no destination path is passed.
    success_subfolder = self.opts.archive_success_subfolder
    destination = f"{folder_path}/{success_subfolder}" if success_subfolder else None

    for sp_file in loaded_files:
        self.sharepoint_utils.archive_sharepoint_file(
            sp_file,
            to_path=destination,
            move_enabled=self.opts.archive_enabled,
        )

    return combined

`resolve_spark_csv_options(file_content)` ¶

Resolve Spark CSV read options by validating or detecting delimiter.

Parameters:

Name	Type	Description	Default
`file_content`	`bytes`	Raw file bytes.	required

Returns:

Type	Description
`dict`	Dict of Spark CSV options (includes delimiter).

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

def resolve_spark_csv_options(self, file_content: bytes) -> dict:
    """Resolve Spark CSV read options by validating or detecting delimiter.

    A user-supplied "sep" takes precedence over "delimiter"; without either,
    the delimiter is detected from the content. When "expected_columns" is
    configured, the header is compared against it (case-insensitively, ignoring
    surrounding whitespace) and a mismatch only produces a warning.

    Args:
        file_content: Raw file bytes.

    Returns:
        Dict of Spark CSV options: a copy of the local options with "sep" set
        to the final delimiter and "delimiter" removed.
    """
    local_options = self._input_spec.sharepoint_opts.local_options or {}

    if "sep" in local_options:
        user_delimiter = local_options["sep"]
    elif "delimiter" in local_options:
        user_delimiter = local_options["delimiter"]
    else:
        user_delimiter = None

    expected_columns = local_options.get("expected_columns")

    # Apply the comma fallback right away so the header check below never
    # splits on an empty separator.
    final_delimiter = (
        self.detect_delimiter(
            file_content=file_content,
            provided_delimiter=user_delimiter,
            expected_columns=expected_columns,
        )
        or ","
    )

    # Warn if expected column names do not match the header when using the selected
    # delimiter
    if expected_columns:
        try:
            # "utf-8-sig" decodes plain UTF-8 as well and drops a leading
            # byte-order mark, which would otherwise stick to the first column
            # name and cause a false mismatch warning.
            header_line = file_content.decode("utf-8-sig").splitlines()[0].strip()
            actual_columns = [c.strip() for c in header_line.split(final_delimiter)]

            expected_normalized = [str(c).strip().lower() for c in expected_columns]
            actual_normalized = [c.lower() for c in actual_columns]

            if actual_normalized != expected_normalized:
                _LOGGER.warning(
                    "Expected columns don't match CSV header using delimiter '%s'. "
                    "Expected: %s vs. Actual: %s. The read will proceed; "
                    "consider specifying the correct delimiter or "
                    "updating expected_columns.",
                    final_delimiter,
                    expected_columns,
                    actual_columns,
                )
        except Exception as e:
            # Validation is advisory only; never block the read because of it.
            _LOGGER.warning(
                "Failed to validate expected_columns against CSV header. "
                "The read will proceed. Reason: %s",
                e,
            )

    spark_options = dict(local_options)
    spark_options["sep"] = final_delimiter
    # Remove "delimiter" to avoid ambiguity as spark uses "sep"
    spark_options.pop("delimiter", None)

    return spark_options

`SharepointExcelReader` ¶

Bases: SharepointReader

Read Excel files from Sharepoint (not yet implemented).

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

class SharepointExcelReader(SharepointReader):
    """Placeholder reader for Excel files stored in Sharepoint.

    Excel support is not implemented yet; calling ``read`` always fails.
    """

    def read(self) -> DataFrame:
        """Reject the call because Excel reading is not supported yet.

        Kept as a stub for future support of reading .xlsx files from
        Sharepoint folders or files.

        Raises:
            NotImplementedError: Always, since Excel reading is not implemented.
        """
        not_supported = NotImplementedError("Excel reading is not yet implemented.")
        raise not_supported

`read()` ¶

Read Excel files from Sharepoint.

This method is not yet implemented and currently raises an error. Intended for future support of .xlsx file read from Sharepoint folders or files.

Raises:

Type	Description
`NotImplementedError`	Always, since Excel reading is not implemented.

Source code in mkdocs/lakehouse_engine/packages/io/readers/sharepoint_reader.py

def read(self) -> DataFrame:
    """Reject the call because Excel reading is not supported yet.

    Kept as a stub for future support of reading .xlsx files from Sharepoint
    folders or files.

    Raises:
        NotImplementedError: Always, since Excel reading is not implemented.
    """
    not_supported = NotImplementedError("Excel reading is not yet implemented.")
    raise not_supported