MS365 Duplicate Files

Details

Summary

New with version 4.0 is the ability to scan for truely duplicated files in Microsoft 365. Version 4.0 collects the content hash from the files in Microsoft 365 so that the duplicates can be found and remediated.

Details

Duplicated files can be simply redundant taking up extra space or they can be a security risk. Files get downloaded from SharePoint where the security is more regulated to someones OneDrive or MySite and then shared. This could potentially be a big problem.

This report simply helps to find the duplicated files, even if they have been renamed. These files could be spread across any of the SharePoint Document Libraries, Team Drives, or OneDrives drive categories.

The query can be easily modified to limit which drive categories to report on. The drive categories include Sharepoint, Teams, and OneDrive. Simply edit the where clause on line 65 to only include the desired drive category(s).

Code

WITH paths AS (
    SELECT
        di.name,
        di.size,
        di.created_by,
        di.modified_by,
        COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
        di.item_type,
        RIGHT(pp.web_url, LEN(pp.web_url) - LEN(d.web_url)) AS parent_path,
        d.web_url AS drive_path,
        srs.bytes_to_hex_string(di.file_hash) as file_hash,
        CASE
          WHEN udm.id IS NOT NULL THEN 'OneDrive'
          WHEN gdm.id IS NOT NULL THEN 'Teams'
          ELSE 'SharePoint'
        END AS drive_category
    FROM ms365.drive_items AS di
    JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
    JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
    LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
    LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
    LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
    WHERE 1=1
      AND ds.scan_state = 1
      AND di.item_type = 1
),
p AS (
    SELECT
        p.drive_path,
        CASE
            WHEN LEN(p.parent_path) = 0 THEN '/'
            ELSE p.parent_path
        END AS parent_path,
        p.name AS filename,
        p.size as item_size,
        SUM(p.size) over (partition by p.file_hash) as total_hash_size,
        srs.byte_string(p.size) AS total_item_size_string,
        srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
        (SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
        srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
        ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
        CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
        CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
        p.file_hash,
        p.total_hash_count,
        p.total_hash_count - 1 AS wasted_hash_count,
        p.drive_category
    FROM
        paths AS p
        LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
        LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
        LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
        LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
    WHERE 1=1
      AND p.file_hash IS NOT NULL
      AND p.size > 0
      AND p.total_hash_count >= 2
),
x AS (
    SELECT
        p.*,
        SUM(p.item_size) OVER() AS total_size,
        DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
    FROM p
	WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
    SELECT
        SUM(p.wasted_size) AS total_wasted_size
    FROM p
    WHERE p.hash_rownum = 1
)
SELECT
    x.*,
    y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC

WITH paths AS (
    SELECT
        di.name,
        di.size,
        di.created_by,
        di.modified_by,
        COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
        di.item_type,
        RIGHT(pp.web_url, length(pp.web_url) - length(d.web_url)) AS parent_path,
        d.web_url AS drive_path,
        srs.bytes_to_hex_string(di.file_hash) as file_hash,
        CASE
          WHEN udm.id IS NOT NULL THEN 'OneDrive'
          WHEN gdm.id IS NOT NULL THEN 'Teams'
          ELSE 'SharePoint'
        END AS drive_category
    FROM ms365.drive_items AS di
    JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
    JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
    LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
    LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
    LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
    WHERE 1=1
      AND ds.scan_state = 1
      AND di.item_type = 1
),
p AS (
    SELECT
        p.drive_path,
        CASE
            WHEN length(p.parent_path) = 0 THEN '/'
            ELSE p.parent_path
        END AS parent_path,
        p.name AS filename,
        p.size as item_size,
        SUM(p.size) over (partition by p.file_hash) as total_hash_size,
        srs.byte_string(p.size) AS total_item_size_string,
        srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
        (SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
        srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
        ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
        CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
        CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
        p.file_hash,
        p.total_hash_count,
        p.total_hash_count - 1 AS wasted_hash_count,
        p.drive_category
    FROM
        paths AS p
        LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
        LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
        LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
        LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
    WHERE 1=1
      AND p.file_hash IS NOT NULL
      AND p.size > 0
      AND p.total_hash_count >= 2
),
x AS (
    SELECT
        p.*,
        SUM(p.item_size) OVER() AS total_size,
        DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
    FROM p
	WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
    SELECT
        SUM(p.wasted_size) AS total_wasted_size
    FROM p
    WHERE p.hash_rownum = 1
)
SELECT
    x.*,
    y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC

Author

rlagger

Post date

Thursday, April 8, 2021 - 08:41

Last modified

Friday, April 26, 2024 - 12:48

Properties

Supported Version

4.1

4.0

Report Category

Microsoft 365