## Software for anonymization. ## (c) Vicenç Torra ## Current version: 20221201 ## ## References on data privacy: ## V. Torra, Guide to data privacy, Springer, 2022. ## ## Main functions: ## ## Masking methods: ## mdav (db, k) ## MDAV, k size of clusters ## noiseAddition (x, k) ## noise addition, k parameter for quantity of noise ## noiseAdditionLaplace (x, k) ## noise addition Laplace, k parameter for quantity of noise ## maskingNMF(data, quality) ## masking using non-negative matrix factorization, parameter: quality ## maskingSVD (data, quality) ## masking using singular value decomposition, parameter: quality ## maskingPCA (data, quality) ## masking using principal components, parameter: quality ## mondrian (db, k) ## mondrian ## ## Information loss: ## sdcIL_stats (x, xp) ## sdcIL_mlRE (x, xp, y, testX, testY, modelRegression) ## sdcIL_mlRE_GivenOErr (oError, xp, y, testX, testY, modelRegression) ## ## Disclosure risk using record linkage ## sdcRecordLinkage (x, xp): ## sdcRecordLinkageLoL (x, xp, varsX, varsXp): ## ## Other functions related to masking ## maFromCl2DB (db, assignedCl) ## from clusters to values ## mdavCl (db, k) ## mdav clusters ## mondrianCl (db, k) ## mondrian clusters ## ## ## OTHER FUNCTIONS for microaggregation ## remainingCl(clNumber, assignedClass) ## from clusters to ## updateCl (indexCls, clNumber, assignedClass): ## kClosestToVect (db, assignments, vect, k): ## orderWithList (toOrder, db, indexDb): ## addOrderedWithList(toOrder, dbOrder, indexDb, d, vect, indexVect): ## ## OTHER FUNCTIONS for PCA ## graphFromDiagonal2MatrixRC (nRows, nColumns, diagonal): ## ## OTHER FUNCTIONS for Mondrian: ## mondrianWithUnassigned (db, k, assignedCl=None, firstNCl = 0, stillToProcess = -1): ## partitionAttributeValue(unassigned, selectedAttribute): ## indexLowMidHighValues(db, assignedCl, idCl, midValue, selectedAttribute): ## exec(open("prog.vectors.matrices.web.txt").read()) import numpy as np ## This function computes a masked file from the db and 
## an assignment of records to clusters
## This is the common part of both mdav and mondrian

## Function:
##   build the masked database: replace every record of db by the centroid
##   (column-wise mean) of the cluster it belongs to.
##   assignedCl[i] is the cluster identifier of record db[i].
## Returns: a new list of rows, same length as db.
def maFromCl2DB (db, assignedCl):
    ## centroid of each cluster, keyed by the cluster identifier itself;
    ## a dict avoids relying on set(assignedCl) iterating 0,1,2,... in order
    centroids = {}
    for cl in set(assignedCl):
        rowsCl = selectFv(db, assignedCl, lambda x:x==cl)
        centroids[cl] = matrixColumnMeans(rowsCl)
    newDb = [centroids[assignedCl[i]] for i in range(0,len(db))]
    return(newDb)

## Function:
##   MDAV microaggregation: cluster db into groups of (at least) k records
##   and replace each record by its cluster centroid.
## Examples:
##   mdav([[100],[200],[300],[400]],3)
##   mdav([[100],[200],[300],[400],[500],[600]],3)
##   mdav([[100],[200],[300],[400],[500],[600],[700]],3)
##   mdav([[100],[100],[200],[300],[400],[500],[600],[700]],3)
##   mdav([[100],[100],[100],[200],[300],[400],[500],[600],[700],[700]],3)
##   mdav([[100],[100],[100],[200],[300],[400],[500],[600],[600],[700],[700]],3)
##   mdav([[100],[100],[100],[200],[300],[400],[500],[600],[600],[600],[700],[700]],3)
##   mdav([[100],[100],[100],[200],[300],[400],[500],[600],[650],[600],[700],[700]],3)
##   f1080_fHeader, f1080_fRows = readNumericalFileWithHeader("1080.csv")  ## CASC dataset
##   f1080_fMeans, f1080_fSD, f1080_fRowsNorm = normalizeMatrixByColumn(f1080_fRows)
##   mdav(f1080_fRows, 3)
##   mdav(f1080_fRows, 3, distWeights2WEuclidean ([0.6,0.4]+[0]*11))
def mdav (db, k, vDistance=fNorm2):
    ## cluster with MDAV, then replace each record by its cluster centroid
    assignedCl = mdavCl(db,k,vDistance)
    return(maFromCl2DB(db, assignedCl))
## Function:
##   MDAV clustering of database db with minimum cluster size k.
##   Returns, for each record, the identifier (0,1,2,...) of its cluster.
## Examples:
##   mdavCl([[100],[200],[300],[400]],3)
##   mdavCl([[100],[200],[300],[400],[500],[600]],3)
##   mdavCl([[100],[200],[300],[400],[500],[600],[700]],3)
##   mdavCl([[100],[100],[200],[300],[400],[500],[600],[700]],3)
##   mdavCl([[100],[100],[100],[200],[300],[400],[500],[600],[700],[700]],3)
##   mdavCl([[100],[100],[100],[200],[300],[400],[500],[600],[600],[700],[700]],3)
##   mdavCl([[100],[100],[100],[200],[300],[400],[500],[600],[600],[600],[700],[700]],3)
##   mdavCl([[100],[100],[100],[200],[300],[400],[500],[600],[650],[600],[700],[700]],3)
##   mdavCl(f1080_fRows, 3)
##   mdavCl(f1080_fRows, 3, fNorm2)
##   mdavCl(f1080_fRows, 3, distWeights2WEuclidean ([0.6,0.4]+[0]*11))
def mdavCl (db, k, vDistance=fNorm2):
    ## -1 marks a record not yet assigned to any cluster
    assignedClass = [-1]*len(db)
    clNumber = -1
    nPendingElements = len(db)
    ## while at least 3k records remain: build one cluster of size k around
    ## xr (the unassigned record farthest from the mean of the unassigned
    ## ones) and another around xs (the record farthest from xr)
    while (nPendingElements>=3*k):
        unassigned = selectFv(db,assignedClass, lambda x:x==-1)
        meanX = matrixColumnMeans(unassigned)
        xr = farthestRow(unassigned, meanX, vDistance)
        xs = farthestRow(unassigned, unassigned[xr], vDistance)
        toO, dbO, indexCr = kClosestToVect (db, assignedClass, unassigned[xr], k, vDistance)
        clNumber=clNumber+1
        assignedClass = updateCl(indexCr, clNumber, assignedClass)
        toO, dbO, indexCs = kClosestToVect (db, assignedClass, unassigned[xs], k, vDistance)
        clNumber=clNumber+1
        assignedClass = updateCl(indexCs, clNumber, assignedClass)
        nPendingElements = nPendingElements-2*k
    ## between 2k and 3k-1 records left: one more cluster of size k around
    ## the record farthest from the mean of the remaining ones
    if (nPendingElements>=2*k):
        unassigned = selectFv(db,assignedClass, lambda x:x==-1)
        meanX = matrixColumnMeans(unassigned)
        xr = farthestRow(unassigned, meanX, vDistance)
        toO, dbO, indexCr = kClosestToVect (db, assignedClass, unassigned[xr], k, vDistance)
        clNumber=clNumber+1
        assignedClass = updateCl(indexCr, clNumber, assignedClass)
        nPendingElements=nPendingElements-k
    ## all remaining records (fewer than 2k) form the last cluster
    clNumber=clNumber+1
    assignedClass = remainingCl(clNumber, assignedClass)
    return assignedClass

# Function:
#   assign all still-unassigned positions (value -1) to class clNumber.
#   Mutates assignedClass in place and returns it.
# Example:
#   remainingCl(500,[-1,-1,2,3,4,5,6,-1,8,9,0])
def remainingCl(clNumber, assignedClass):
    for i in range(0,len(assignedClass)):
        if (assignedClass[i]==-1):
            assignedClass[i]=clNumber
    return (assignedClass)
# Function:
#   write the class identifier clNumber at every position of assignedClass
#   listed in indexCls. Mutates assignedClass in place and returns it.
# Example:
#   updateCl([0,1,2,6], 500, [0,1,2,3,4,5,6,7,8,9,10])
# NOTE:
#   every index in indexCls must be within range of assignedClass
def updateCl (indexCls, clNumber, assignedClass):
    for i in range(0,len(indexCls)):
        assignedClass[indexCls[i]]=clNumber
    return assignedClass

# Function:
#   among the rows of db still unassigned (assignments[i]==-1), select the
#   k rows nearest to vect; return (distances, rows, indices) ordered by
#   increasing distance to vect (ties keep their original row order).
# Example:
#   kClosestToVect([[50],[60],[10],[40],[80],[70]],[-1,-1,-1,-1,-1,-1], [40], 3) == ([0, 100, 400], [[40], [50], [60]], [3, 0, 1])
#   kClosestToVect([[50],[60],[10],[40],[80],[70]],[-1,-1,2,-1,-1,1], [40], 3) == ([0, 100, 400], [[40], [50], [60]], [3, 0, 1])
def kClosestToVect (db, assignments, vect, k, vDistance=fNorm2):
    ## (distance, index) for every unassigned row
    candidates = [(vDistance(vect, db[i]), i) for i in range(0,len(db)) if assignments[i]==-1]
    ## stable sort on distance: equal distances keep increasing row index
    candidates.sort(key=lambda pair: pair[0])
    closest = candidates[0:k]
    toOrder = [d for (d, i) in closest]
    dbOrder = [db[i] for (d, i) in closest]
    indexDb = [i for (d, i) in closest]
    return((toOrder,dbOrder,indexDb))

## Function:
##   additive Gaussian noise with parameter k.
##   NOTE: normal(0, variance=sigma^2), sd = sqrt(variance); column j of x
##   gets noise drawn from N(0, sd=sqrt(k*Var(column j))).
## Example:
##   noiseAddition (mat, 0)
##   noiseAddition (mat, 1)
##   noiseAddition (mat, 2)
def noiseAddition (x, k):
    variance = matrixColumnVariance(x)
    vectorParam = list(map(lambda v:np.sqrt(v*k), variance))
    xp = []
    for i in range(0,len(x)):
        xp.append(vectorSum(x[i], list(map(lambda sd:np.random.normal(0, sd), vectorParam))))
    return(xp)

## Function:
##   additive Laplace noise with parameter k.
##   NOTE: laplace(0, b) has variance 2*b^2, so b = sqrt(variance/2);
##   this matches the per-column variance k*Var(column j) of noiseAddition.
def noiseAdditionLaplace (x, k):
    variance = matrixColumnVariance(x)
    vectorParam = list(map(lambda v:np.sqrt(v*k/2), variance))
    xp = []
    for i in range(0,len(x)):
        xp.append(vectorSum(x[i], list(map(lambda b:np.random.laplace(0, b), vectorParam))))
    return(xp)

from sklearn.decomposition import NMF

## Function:
##   masking using non-negative matrix factorization, with parameter
##   quality (number of components). With scale=True the data is min-max
##   scaled first and unscaled afterwards (NMF requires non-negative data).
## Example:
##   maskingNMF(mat, 1)
##   maskingNMF(mat, 2)
def maskingNMF(data, quality, scale=False):
    if scale:
        lMin, lMax, data = matrixScaleMinMax (data)
    dArray = np.array(data)
    nmf = NMF(n_components=quality, init="random", max_iter=1000)
    W = nmf.fit_transform(dArray)
    H = nmf.components_
    ## reconstruct the data from the low-rank factors: this is the masked file
    dArrayMasked = np.dot(W,H)
    if scale:
        dArrayMasked = matrixUnScaleMinMax (lMin, lMax, dArrayMasked)
    return(dArrayMasked)

## Example runs:
#   sdcIL_stats(f1080_fRows,maskingNMF(f1080_fRows, 13))
#   sdcIL_stats(f1080_fRows,maskingSVD(f1080_fRows, 13))
#   testa0 = list(map(lambda x:sdcIL_stats(f1080_fRows,maskingSVD(f1080_fRows, x)), [5,10,15,20,25,30,40,50,60]))
#   testa1 = list(map(lambda x:sdcIL_stats(f1080_fRows,maskingNMF(f1080_fRows, x)), [5,10,15,20,25,30,40,50,60,70,80]))

import copy

## Function:
##   masking using (truncated) singular value decomposition, with parameter
##   quality = number of singular components kept (capped at len(data)).
## Example:
##   maskingSVD(mat, 1)
##   maskingSVD(mat, 2)
def maskingSVD (data, quality):
    nRows = len(data)
    kMin = min(quality, nRows)
    U, s, Vh = np.linalg.svd(np.array(data))
    sZero = copy.deepcopy(s)
    ## rebuild the (possibly rectangular) diagonal matrix of singular values
    mS = graphFromDiagonal2MatrixRC (nRows, len(data[0]), sZero)
    ## keep only the first kMin components of each factor
    Uk = U[:,:kMin]
    mSk = mS[:kMin,:kMin]
    Vhk = Vh[:kMin,:]
    ## low-rank reconstruction U_k S_k Vh_k: this is the masked file
    USVhk = Uk.dot(mSk.dot(Vhk))
    return(USVhk)

def graphFromDiagonal2MatrixRC (nRows, nColumns, diagonal):
    """
    From the diagonal of a matrix to a (possibly rectangular) matrix of
    size nRows x nColumns, with the diagonal in the top-left corner and
    zeros elsewhere.
    Example: graphFromDiagonal2MatrixRC (3, 4, [1,2,3])
    """
    S = np.zeros((nRows, nColumns))
    lD = len(diagonal)
    S[:lD, :lD] = np.diag(diagonal)
    return(S)

from sklearn.decomposition import PCA

## Function:
##   masking using PCA, with parameter quality (n_components): project on
##   the first components and map back to the original space.
## Example:
##   maskingPCA(mat, 1)
##   maskingPCA(mat, 2)
def maskingPCA (data, quality):
    pca = PCA(n_components=quality)
    data_pca = pca.fit_transform(data)
    data_pca_invers = pca.inverse_transform(data_pca)
    return(data_pca_invers)

# testpca1 = list(map(lambda x:sdcIL_stats(f1080_fRows,maskingPCA(f1080_fRows, x)), [1,2,3,4,5,6,7,8,9,10,11,12,13]))

## Information Loss: IL ------------------------------------------

## Function:
##   information loss between an original file x and a masked file xp:
##   1) distance between the vectors of column means
##   2) distance between the vectors of column standard deviations
##   3) maximum entry of the (signed) difference x - xp
## Example:
##   sdcIL_stats(mdav(f1080_fRows, 3), f1080_fRows)
def sdcIL_stats (x, xp, vDistance=fNorm):
    meanX = matrixColumnMeans(x)
    sdX = matrixColumnSD(x)
    meanXp = matrixColumnMeans(xp)
    sdXp = matrixColumnSD(xp)
    dMean = vDistance(meanX,meanXp)
    dSD = vDistance(sdX,sdXp)
    ## NOTE: signed maximum, not maximum absolute difference
    dMax = max(list(map(lambda row:max(row),np.subtract(x,xp))))
    return (dMean, dSD, dMax)

## Function:
##   train modelRegression on (XTrIndep, yTrDep), evaluate on the test set;
##   returns (root-sum-of-squares error divided by the test size, residuals)
def sdcRegressionError (XTrIndep, yTrDep, XTeIndep, yTeDep, modelRegression):
    model = modelRegression()
    model.fit(XTrIndep, yTrDep)
    diff = (model.predict(XTeIndep) - yTeDep)
    myError = np.sqrt(sum(diff*diff))
    score= model.score(XTeIndep, yTeDep)
    return(myError/len(yTeDep), diff)

def sdcIL_mlRE (x, xp, y, testX, testY, modelRegression):
    """
    Compare the prediction error of models trained on the original and on
    the masked file, for a given test set.
    Input:
      x: input variables, original file
      xp: input variables, protected file
      y: output variable, for both original and protected (not modified)
      testX: input variables, records for testing
      testY: output variable, records for testing
      modelRegression: regression model constructor
    Returns: maskedError - originalError
    """
    originalError = sdcRegressionError (x, y, testX, testY, modelRegression)
    maskedError = sdcRegressionError (xp, y, testX, testY, modelRegression)
    return (maskedError[0] - originalError[0])

def sdcIL_mlREmY (x, xp, y, yp, testX, testY, modelRegression):
    """
    Compare the prediction error of models trained on the original and on
    the masked file, for a given test set (masked output variable too).
    Input:
      x: input variables, original file
      xp: input variables, protected file
      y: output variable, for the original file
      yp: output variable, for the protected file
      testX: input variables, records for testing
      testY: output variable, records for testing
      modelRegression: regression model constructor
    Returns: maskedError - originalError
    """
    originalError = sdcRegressionError (x, y, testX, testY, modelRegression)
    maskedError = sdcRegressionError (xp, yp, testX, testY, modelRegression)
    return (maskedError[0] - originalError[0])

def sdcIL_mlRE_GivenOErr (oError, xp, y, testX, testY, modelRegression):
    """
    As sdcIL_mlRE, but the error of the model on the original file is
    already known (oError), so only the masked model is trained.
    Input:
      oError: error on the original file, precomputed
      xp: input variables, protected file
      y: output variable (not modified)
      testX: input variables, records for testing
      testY: output variable, records for testing
      modelRegression: regression model constructor
    Returns: maskedError - oError
    """
    maskedError = sdcRegressionError (xp, y, testX, testY, modelRegression)
    return (maskedError[0] - oError)

def testSdcIL_ml (x, y, testX, testY, modelRegression, masking, loParameters):
    """
    Test IL in terms of prediction error, for a masking method and a list
    of parameters.
    Parameters:
      x: training data set, input variables
      y: training data set, output variable
      testX: test data set, input variables
      testY: test data set, output variable
      modelRegression: regression model constructor
      masking (x,k): function that masks x with parameter k
      loParameters: list of parameters for masking
    """
    originalError = sdcRegressionError (x, y, testX, testY, modelRegression)
    SDCerror = []
    for k in loParameters:
        ## use the modelRegression given as parameter (the original code
        ## hard-coded sklearn.linear_model.LinearRegression here)
        SDCerror.append(sdcIL_mlRE_GivenOErr (originalError[0], masking(x,k), y, testX, testY, modelRegression))
        print (k)
    return(SDCerror)

# EXAMPLES:
#   from sklearn.kernel_ridge import KernelRidge
#   lp_mdav = [2,3,4,5,10,15,20,25,30,40,41,42,43,44,45,46,50,60,70,80,90,100]
#   DXErrorMDAV_LR = testSdcIL_mlRE (trDX, trDY, teDX, teDY,sklearn.linear_model.LinearRegression, mdav, lp_mdav)
#   DXErrorMDAV_SG = testSdcIL_mlRE (trDX, trDY, teDX, teDY,sklearn.linear_model.SGDRegressor, mdav, lp_mdav)
#   DXErrorMDAV_KR = testSdcIL_mlRE (trDX, trDY, teDX, teDY,sklearn.kernel_ridge.KernelRidge, mdav, lp_mdav)
#   DXErrorMDAV_SV = testSdcIL_mlRE (trDX, trDY, teDX, teDY,sklearn.svm.SVR, mdav, lp_mdav)

## DR: Disclosure Risk ------------------------------------------

## Function:
##   distance-based record linkage between x and xp (assumed aligned:
##   record i of x corresponds to record i of xp). Returns the number of
##   correct re-identifications and the number of records.
## Example:
##   sdcRecordLinkage (mdav(f1080_fRows, 3), f1080_fRows)
def sdcRecordLinkage (x, xp, vDistance=fNorm2):
    match = 0
    for i in range(0,len(x)):
        ## nearest record in xp to x[i]
        iMin = 0
        dMin = vDistance(x[i],xp[iMin])
        for j in range(1,len(xp)):
            d = vDistance(x[i],xp[j])
            if (d < dMin):
                dMin = d
                iMin = j
        if (iMin==i):
            match = match + 1
    return (match, len(x))

## Function:
##   record linkage restricted to the variables varsX of x and varsXp of xp:
##   (1) compute an assignment of x to xp,
##   (2) the number of correct matches (x and xp assumed aligned),
##   (3) the length of x.
## Example:
##   f1080p = noiseAddition (f1080_fRows, 2)
##   sdcRecordLinkageLoL(f1080_fRows[1:10],f1080p[1:10], [1],[1])
##   irisp = noiseAddition(iris_fRows,1)
##   sdcRecordLinkageLoL(iris_fRows[1:10],irisp[1:10], [1],[1])
##   sdcRecordLinkageLoL(iris_fRows[1:10],irisp[1:10], [1,2],[1,2])
def sdcRecordLinkageLoL (x, xp, varsX, varsXp, vDistance=fNorm2):
    lol = []
    match = 0
    for i in range(0,len(x)):
        iMin = 0
        dMin = vDistance(listSelection(x[i],varsX),listSelection(xp[iMin],varsXp))
        for j in range(1,len(xp)):
            d = vDistance(listSelection(x[i],varsX),listSelection(xp[j],varsXp))
            if (d < dMin):
                dMin = d
                iMin = j
        if (iMin==i):
            match = match + 1
        lol.append(iMin)
    return (lol,match, len(x))

def testSdcRL (x, masking, loParameters):
    """
    Test record linkage for a masking method and a list of parameters.
    Parameters:
      x: data set
      masking (x,k): function that masks x with parameter k
      loParameters: list of parameters for masking
    """
    RL = []
    for k in loParameters:
        RL.append(sdcRecordLinkage (x, masking(x,k)))
        print (k)
    return(RL)

## DXRL_MDAV = testSdcRL (trDX, mdav, lp_mdav)
## fheader, frows = readNumericalFileWithHeader("Concrete_DataVarsV1V7.csv")
## resRL = [(sdcRecordLinkage (mdav(fRows, k), fRows)[0])/1030.0 for k in range(1,5)]
## resIL = [sdcIL_stats(mdav(fRows, k), fRows)[1] for k in range(1,5)]
## resRLmondrian = [(sdcRecordLinkage (mondrian(fRows, k), fRows)[0])/1030.0 for k in range(1,5)]
## resILmondrian = [sdcIL_stats(mondrian(fRows, k), fRows)[1] for k in range(1,5)]

## Function:
##   Mondrian microaggregation: cluster db with minimum cluster size k and
##   replace each record by its cluster centroid.
def mondrian (db, k):
    assignedCl = mondrianCl(db,k)
    ## from clusters to centroids (common code with mdav)
    return(maFromCl2DB(db, assignedCl))

## Function:
##   Mondrian clustering: returns, for each record of db, the identifier
##   (0,1,2,...) of the cluster it belongs to.
def mondrianCl (db, k):
    clId, assignedCl = mondrianWithUnassigned (db, k, [-1]*len(db), 0, -1)
    return assignedCl

## Function:
##   recursive Mondrian partitioning. Records currently labelled
##   stillToProcess are either (a) closed as final cluster firstNCl when
##   fewer than 2k remain, or (b) split on the attribute with the widest
##   range and processed recursively. Temporary cluster labels are negative
##   (stillToProcess*2 and stillToProcess*2+1); final labels are
##   consecutive non-negative integers starting at firstNCl.
##   Returns (next free final cluster id, assignedCl).
## Example:
##   mondrianWithUnassigned([[100],[100],[100],[200],[300],[400],[500],[600],[650],[600],[700],[700]],3, [-1]*12)
def mondrianWithUnassigned (db, k, assignedCl=None, firstNCl = 0, stillToProcess = -1):
    if assignedCl is None:
        assignedCl = [-1]*len(db)
    unassigned = selectFv(db,assignedCl, lambda x:x==stillToProcess)
    if len(unassigned) < 2*k:
        ## too small to split: close this set as final cluster firstNCl
        index2Update = list(filter(lambda i: assignedCl[i]==stillToProcess, range(0,len(assignedCl))))
        newClNumber = firstNCl
        newAssignedCl = updateCl(index2Update, newClNumber, assignedCl)
        return (firstNCl+1, newAssignedCl)
    else:
        ## split on the attribute with the largest max-min range
        loMaxMinusMin = matrixColumnFunction (unassigned, lambda x:(max(x)-min(x)))
        selectedAttribute = loMaxMinusMin.index(max(loMaxMinusMin))
        cutPoint = partitionAttributeValue (unassigned, selectedAttribute)
        iLow, iHigh = indexLowMidHighValues(db, assignedCl, stillToProcess, cutPoint, selectedAttribute)
        ## NOTE: updateCl mutates the list in place, so newAssignedCl,
        ## new1AssignedCl, ... all alias the same list; keep this order
        clLowValues = stillToProcess*2
        newAssignedCl = updateCl(iLow, clLowValues, assignedCl)
        new1FirstNCl, new1AssignedCl = mondrianWithUnassigned(db, k, newAssignedCl, firstNCl, clLowValues)
        clHighValues = stillToProcess*2+1
        allAssignedCl = updateCl(iHigh, clHighValues, newAssignedCl)
        new2FirstNCl, new2AssignedCl = mondrianWithUnassigned(db, k, new1AssignedCl, new1FirstNCl, clHighValues)
        return (new2FirstNCl, new2AssignedCl)

## Function:
##   cut point for the Mondrian split: the median value of the selected
##   attribute among the given (unassigned) rows.
def partitionAttributeValue(unassigned, selectedAttribute):
    allValues = list(map(lambda x:x[selectedAttribute], unassigned))
    allValues.sort()
    midValue = allValues[len(allValues)//2]
    # midValue = (allValues[0]+allValues[len(allValues)-1])/2
    return(midValue)

## Function:
##   split the records of cluster idCl in two halves around midValue on
##   selectedAttribute. Records strictly below go low, strictly above go
##   high, and records equal to midValue are distributed so that the low
##   half holds (as close as possible) half of the records.
## Example:
##   indexLowMidHighValues([[1],[1],[1],[1],[1],[4]],[-1]*6,-1,1,0)
##   indexLowMidHighValues(mat, [-1,-1,-1,-1], -1, 2, 0)
def indexLowMidHighValues(db, assignedCl, idCl, midValue, selectedAttribute):
    indexLowValues = list(filter(lambda i: assignedCl[i]==idCl and db[i][selectedAttribute] < midValue, range(0,len(db))))
    indexMidValues = list(filter(lambda i: assignedCl[i]==idCl and db[i][selectedAttribute] == midValue, range(0,len(db))))
    indexHighValues = list(filter(lambda i: assignedCl[i]==idCl and db[i][selectedAttribute] > midValue, range(0,len(db))))
    numberValues = len(indexLowValues) + len(indexMidValues) + len(indexHighValues)
    ## number of mid-valued records moved to the low half to balance sizes
    toLow = max(numberValues//2 - len(indexLowValues),0)
    indexLow = indexLowValues + indexMidValues[0:toLow]
    indexHigh = indexMidValues[toLow:] + indexHighValues
    return(indexLow,indexHigh)

## mondrianWithUnassigned([[100],[100],[100],[200],[300],[400],[500],[600],[650],[600],[700],[700]],3, [-1]*12)