MS365 Duplicate Files
MS365 Duplicate Files
Details
Summary
New with version 4.0 is the ability to scan for truely duplicated files in Microsoft 365. Version 4.0 collects the content hash from the files in Microsoft 365 so that the duplicates can be found and remediated.
Details
Duplicated files can be simply redundant taking up extra space or they can be a security risk. Files get downloaded from SharePoint where the security is more regulated to someones OneDrive or MySite and then shared. This could potentially be a big problem.
This report simply helps to find the duplicated files, even if they have been renamed. These files could be spread across any of the SharePoint Document Libraries, Team Drives, or OneDrives drive categories.
The query can be easily modified to limit which drive categories to report on. The drive categories include Sharepoint, Teams, and OneDrive. Simply edit the where clause on line 65 to only include the desired drive category(s).
WITH paths AS (
SELECT
di.name,
di.size,
di.created_by,
di.modified_by,
COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
di.item_type,
RIGHT(pp.web_url, LEN(pp.web_url) - LEN(d.web_url)) AS parent_path,
d.web_url AS drive_path,
srs.bytes_to_hex_string(di.file_hash) as file_hash,
CASE
WHEN udm.id IS NOT NULL THEN 'OneDrive'
WHEN gdm.id IS NOT NULL THEN 'Teams'
ELSE 'SharePoint'
END AS drive_category
FROM ms365.drive_items AS di
JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
WHERE 1=1
AND ds.scan_state = 1
AND di.item_type = 1
),
p AS (
SELECT
p.drive_path,
CASE
WHEN LEN(p.parent_path) = 0 THEN '/'
ELSE p.parent_path
END AS parent_path,
p.name AS filename,
p.size as item_size,
SUM(p.size) over (partition by p.file_hash) as total_hash_size,
srs.byte_string(p.size) AS total_item_size_string,
srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
(SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
p.file_hash,
p.total_hash_count,
p.total_hash_count - 1 AS wasted_hash_count,
p.drive_category
FROM
paths AS p
LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
WHERE 1=1
AND p.file_hash IS NOT NULL
AND p.size > 0
AND p.total_hash_count >= 2
),
x AS (
SELECT
p.*,
SUM(p.item_size) OVER() AS total_size,
DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
FROM p
WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
SELECT
SUM(p.wasted_size) AS total_wasted_size
FROM p
WHERE p.hash_rownum = 1
)
SELECT
x.*,
y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC
WITH paths AS (
SELECT
di.name,
di.size,
di.created_by,
di.modified_by,
COUNT(*) OVER (PARTITION BY di.file_hash) AS total_hash_count,
di.item_type,
RIGHT(pp.web_url, length(pp.web_url) - length(d.web_url)) AS parent_path,
d.web_url AS drive_path,
srs.bytes_to_hex_string(di.file_hash) as file_hash,
CASE
WHEN udm.id IS NOT NULL THEN 'OneDrive'
WHEN gdm.id IS NOT NULL THEN 'Teams'
ELSE 'SharePoint'
END AS drive_category
FROM ms365.drive_items AS di
JOIN ms365.drives AS d ON d.ms365_id = di.ms365_drive_id
JOIN ms365.drive_scans AS ds ON ds.drive_id = d.id AND ds.id = di.scan_id
LEFT JOIN ms365.drive_items AS pp ON pp.ms365_id = di.ms365_parent_id AND di.scan_id = pp.scan_id
LEFT JOIN ms365.group_drives AS gdm ON gdm.ms365_drive_id = d.ms365_id
LEFT JOIN ms365.user_drives AS udm ON udm.ms365_drive_id = d.ms365_id
WHERE 1=1
AND ds.scan_state = 1
AND di.item_type = 1
),
p AS (
SELECT
p.drive_path,
CASE
WHEN length(p.parent_path) = 0 THEN '/'
ELSE p.parent_path
END AS parent_path,
p.name AS filename,
p.size as item_size,
SUM(p.size) over (partition by p.file_hash) as total_hash_size,
srs.byte_string(p.size) AS total_item_size_string,
srs.byte_string(CAST(SUM(p.size) over (partition by p.file_hash) as bigint)) as total_hash_size_string,
(SUM(p.size) over (partition by p.file_hash) - p.size) as wasted_size,
srs.byte_string(CAST((SUM(p.size) over (partition by p.file_hash) - p.size) as bigint)) as wasted_size_string,
ROW_NUMBER() OVER (partition by p.file_hash order by p.size) AS hash_rownum,
CASE WHEN uc.display_name IS NOT NULL THEN uc.display_name ELSE gc.display_name END AS created_by,
CASE WHEN um.display_name IS NOT NULL THEN um.display_name ELSE gm.display_name END AS modified_by,
p.file_hash,
p.total_hash_count,
p.total_hash_count - 1 AS wasted_hash_count,
p.drive_category
FROM
paths AS p
LEFT OUTER JOIN ms365.users AS uc ON uc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.users AS um ON um.ms365_id = p.modified_by
LEFT OUTER JOIN ms365.groups AS gc ON gc.ms365_id = p.created_by
LEFT OUTER JOIN ms365.groups AS gm ON gm.ms365_id = p.modified_by
WHERE 1=1
AND p.file_hash IS NOT NULL
AND p.size > 0
AND p.total_hash_count >= 2
),
x AS (
SELECT
p.*,
SUM(p.item_size) OVER() AS total_size,
DENSE_RANK() OVER (ORDER BY p.wasted_size DESC) AS wasted_size_rank
FROM p
WHERE p.drive_category in ('SharePoint', 'Teams', 'OneDrive')
),
y AS (
SELECT
SUM(p.wasted_size) AS total_wasted_size
FROM p
WHERE p.hash_rownum = 1
)
SELECT
x.*,
y.total_wasted_size
FROM x
CROSS JOIN y
ORDER BY total_hash_count DESC
| Attachment | Size |
|---|---|
| 6.48 KB |
| Attachment | Size |
|---|---|
| 266.07 KB |