MS365 Duplicate Files
MS365 Duplicate Files
Details
Summary
New with version 4.0 is the ability to scan for truely duplicated files in Microsoft 365. Version 4.0 collects the content hash from the files in Microsoft 365 so that the duplicates can be found and remediated.
Details
Duplicated files can be simply redundant taking up extra space or they can be a security risk. Files get downloaded from SharePoint where the security is more regulated to someones OneDrive or MySite and then shared. This could potentially be a big problem.
This report simply helps to find the duplicated files, even if they have been renamed. These files could be spread across any of the SharePoint Document Libraries, Team Drives, or OneDrives drive categories.
The query can be easily modified to limit which drive categories to report on. The drive categories include Sharepoint, Teams, and OneDrive. Simply edit the where clause on line 65 to only include the desired drive category(s).
WITH paths AS ( SELECT di.name, di.size, di.created_by, di.modified_by, COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count, di.item_type, RIGHT(pp.web_url, LEN(pp.web_url) - LEN(d.web_url)) AS parent_path, d.web_url AS drive_path, srs.bytes_to_hex_string(di.file_hash) as file_hash, CASE WHEN udm.id IS NOT NULL THEN 'OneDrive' WHEN gdm.id IS NOT NULL THEN 'Teams' ELSE 'SharePoint' END AS drive_category FROM ms365.drive_items AS di JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id WHERE 1=1 AND ds.scan_state = 1 AND di.item_type = 1 ), p AS ( SELECT p.drive_path, CASE WHEN LEN(p.parent_path) = 0 THEN '/' ELSE p.parent_path END AS parent_path, p.name AS filename, p.size as item_size, SUM(p.size) over (partition by p.file_hash) as total_hash_size, srs.byte_string(p.size) AS total_item_size_string, srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string, (SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size, srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string, ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum, CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by, CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by, p.file_hash, p.total_hash_count, p.total_hash_count - 1 AS wasted_hash_count, p.drive_category FROM paths AS p LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by WHERE 1=1 AND p.file_hash IS NOT NULL AND p.size > 0 AND p.total_hash_count >= 2 ), x AS ( SELECT p.*, SUM(p.item_size) OVER() AS total_size, DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank FROM p WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive') ), y AS ( SELECT SUM(p.wasted_size) AS total_wasted_size FROM p WHERE p.hash_rownum = 1 ) SELECT x.*, y.total_wasted_size FROM x CROSS JOIN y ORDER BY total_hash_count DESC
WITH paths AS ( SELECT di.name, di.size, di.created_by, di.modified_by, COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count, di.item_type, RIGHT(pp.web_url, length(pp.web_url) - length(d.web_url)) AS parent_path, d.web_url AS drive_path, srs.bytes_to_hex_string(di.file_hash) as file_hash, CASE WHEN udm.id IS NOT NULL THEN 'OneDrive' WHEN gdm.id IS NOT NULL THEN 'Teams' ELSE 'SharePoint' END AS drive_category FROM ms365.drive_items AS di JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id WHERE 1=1 AND ds.scan_state = 1 AND di.item_type = 1 ), p AS ( SELECT p.drive_path, CASE WHEN length(p.parent_path) = 0 THEN '/' ELSE p.parent_path END AS parent_path, p.name AS filename, p.size as item_size, SUM(p.size) over (partition by p.file_hash) as total_hash_size, srs.byte_string(p.size) AS total_item_size_string, srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string, (SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size, srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string, ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum, CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by, CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by, p.file_hash, p.total_hash_count, p.total_hash_count - 1 AS wasted_hash_count, p.drive_category FROM paths AS p LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by WHERE 1=1 AND p.file_hash IS NOT NULL AND p.size > 0 AND p.total_hash_count >= 2 ), x AS ( SELECT p.*, SUM(p.item_size) OVER() AS total_size, DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank FROM p WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive') ), y AS ( SELECT SUM(p.wasted_size) AS total_wasted_size FROM p WHERE p.hash_rownum = 1 ) SELECT x.*, y.total_wasted_size FROM x CROSS JOIN y ORDER BY total_hash_count DESC
Attachment | Size |
---|---|
MS365-DuplicateFiles.zip | 6.48 KB |
Attachment | Size |
---|---|
MS365-DuplicateFiles.pdf | 266.07 KB |