library("readr")
library("hash")
library("stringr")

################################
# parameters
# A set of parameters is defined so this converter can be adapted to different cases.
# We must specify what information will be taken from the file in CSV format,
# where we will leave the result and the graph type
#
################################

source <- 3                            # Column number of the origin of the relationship
target <- 9                            # Column number of the destination of the relationship
source_attribs <- c(5, 6, 11, 12, 13)  # Column numbers where to get the attributes
# Default value for entities without attributes: one NA per configured attribute
# column, so it stays in sync when source_attribs is edited.
null_attrib <- rep(NA, length(source_attribs))
name_attribs <- c("app", "location", "hastag", "lang", "create at") # Names of the attributes
name_file_csv <- "rstudio_RTs.csv"     # Name of the input file in CSV format
name_file_gdf <- "rstudio_RTs.gdf"     # Name of the output file in GDF format
directed <- TRUE                       # Indicates whether the graph is directed or not

# Every attribute column needs a name for the GDF headers; fail early instead of
# failing while the output file is being written.
stopifnot(length(name_attribs) >= length(source_attribs))

###############################
# Data
#
# In order to store the data, dynamic structures are needed to allow data to be
# added as they appear. Hash tables were chosen because they are the most appropriate
# for this case. Hash tables are used to store nodes and connections
###############################

hash_nodes <- hash()               # node name -> attribute vector
hash_links <- hash()               # node name -> total number of links
hash_links_in <- hash()            # node name -> number of inbound links
hash_links_out <- hash()           # node name -> number of outbound links
hash_connections <- hash()         # "source target" -> number of occurrences
hash_connections_attrib <- hash()  # "source target" -> attributes of first occurrence
num_attribs <- length(source_attribs)

###############################
# read source data
#
# Import data reading the CSV file and run it row by row to store the nodes and connections
# in the hash tables.
#
# Related entities can appear multiple times, as a source or as a target. When an entity appears
# for the first time, it is stored in the hash_nodes table. Attributes are associated to source
# entities and null attributes to target entities. It is a criterion that this algorithm assumes,
# but there could be others.
# If an entity appears the first time as a target, it will be assigned
# the null attributes, but if it appears later as a source, the null attributes will be replaced
# by its own
#
# For each entity, the number of total links (hash_links), the number of inbound links (hash_links_in)
# and the number of outbound links (hash_links_out) are counted. This is done to allow ordering the nodes
# from greater to lesser degree when generating the file in GDF format.
#
# For each origin-target entity pair, the number of times that the relation appears (hash_connections)
# and the attributes (hash_connections_attrib) are stored. In the first case, we get the weight of
# the relationship and, in the second, we get the associated attributes.
###############################

table_csv <- read_csv2(name_file_csv)
num_rows <- nrow(table_csv)
num_cols <- ncol(table_csv)

# seq_len() keeps the loop from running on an empty table (1:0 would yield 1, 0).
for (i in seq_len(num_rows)) {
  node_source <- table_csv[[i, source]]
  node_target <- table_csv[[i, target]]
  print(i)
  node_source_attribs <- null_attrib

  # The attributes are stored and the ',' character is changed to '-'
  # to avoid conflict in the GDF format
  for (j in seq_len(num_attribs)) {
    k <- source_attribs[[j]]
    raw_attrib <- table_csv[[i, k]]
    cooked_attrib <- str_replace_all(raw_attrib, ",", "-")
    node_source_attribs[[j]] <- cooked_attrib
  }

  if (!has.key(node_source, hash_nodes)) {
    # If a source node appears for the first time, store with attributes
    hash_nodes[[node_source]] <- node_source_attribs
    hash_links[[node_source]] <- 0
    hash_links_in[[node_source]] <- 0
    hash_links_out[[node_source]] <- 0
  } else {
    # If the source node exists and has null attributes, store its own
    node_source_attribs_old <- hash_nodes[[node_source]]
    if (identical(node_source_attribs_old, null_attrib)) {
      hash_nodes[[node_source]] <- node_source_attribs
    }
  }

  # Check that target node exists
  if (!is.na(node_target)) {
    # If a target node appears for the first time, store with null attributes
    if (!has.key(node_target, hash_nodes)) {
      hash_nodes[[node_target]] <- null_attrib
      hash_links[[node_target]] <- 0
      hash_links_in[[node_target]] <- 0
      hash_links_out[[node_target]] <- 0
    }

    # Store connections. The key is "source target" joined by a single space.
    # NOTE(review): the key is split on spaces again when the links table is
    # built, so node names containing spaces would be mis-split - confirm the
    # input never has them.
    par_nodes <- paste(node_source, node_target)

    # If a pair of nodes appears for the first time related, store relation
    if (!has.key(par_nodes, hash_connections)) {
      hash_connections[[par_nodes]] <- 0
      hash_connections_attrib[[par_nodes]] <- node_source_attribs
    }

    # In all cases, increase the number of connections
    hash_connections[[par_nodes]] <- hash_connections[[par_nodes]] + 1
    hash_links[[node_source]] <- hash_links[[node_source]] + 1
    hash_links_out[[node_source]] <- hash_links_out[[node_source]] + 1
    hash_links[[node_target]] <- hash_links[[node_target]] + 1
    hash_links_in[[node_target]] <- hash_links_in[[node_target]] + 1
  }
}

##################################
# Order descending by connections
#
# The hash table object does not have a sort method, but has one to convert it into a list.
#
# Once hash_links and hash_connections have been converted into lists, they are
# sorted in descending order by number of connections
##################################

# Reorder a named list of counts from the largest value to the smallest.
order_counts_desc <- function(counts) {
  ranking <- order(unlist(counts), decreasing = TRUE)
  counts[ranking]
}

list_links <- as.list.hash(hash_links)
list_link_order <- order_counts_desc(list_links)

list_connections <- as.list.hash(hash_connections)
list_connections_order <- order_counts_desc(list_connections)

##################################
# Prepare the data for the GDF format
#
# In this step we place in GDF format nodes and links in descending order by number
# of connections.
#
# In the GDF format, the only data required for the definition of nodes is the name of the node,
# but attributes can be added. In this case, three fixed attributes are included, which are the
# total number of links, the number of inbound links and the number of outbound links. Since the
# GDF format is readable, these attributes allow getting an idea of the most relevant nodes even
# before importing them into Gephi. The attributes configured in the parameters are also added.
# The information of the nodes is stored in a matrix sized in rows by the number of nodes and in columns
# by the number of attributes configured plus four.
#
# For the definition of links only the source and target nodes are required, but we can also expand
# them with attributes. In this case we add the weight of the relation, a boolean variable to indicate
# if the graph is directed or not (by default it is not directed) and the attributes configured
# in the parameters.
# The information of the links is stored in a matrix dimensioned in rows by the number of pairs of
# connections and in columns by the number of attributes configured plus four.
#################################

# Definition of nodes
#
# Columns: name, total links, inbound links, outbound links, then the configured
# attributes. Rows follow list_link_order (descending by total links), so the
# connected nodes occupy the first num_nodes_connected rows.
num_nodes <- length(list_links)
table_nodes <- matrix(nrow = num_nodes, ncol = num_attribs + 4)
num_nodes_connected <- 0
for (i in seq_len(num_nodes)) {
  name_node <- names(list_link_order)[i]
  if (hash_links[[name_node]] > 0) {
    num_nodes_connected <- num_nodes_connected + 1
    table_nodes[i, 1] <- name_node
    # The matrix is character, so counts are formatted explicitly: plain
    # coercion would write e.g. 100000 as "1e+05", which is not a valid INT.
    table_nodes[i, 2] <- format(hash_links[[name_node]], scientific = FALSE)
    table_nodes[i, 3] <- format(hash_links_in[[name_node]], scientific = FALSE)
    table_nodes[i, 4] <- format(hash_links_out[[name_node]], scientific = FALSE)
    node_attrib <- hash_nodes[[name_node]]
    for (j in seq_len(num_attribs)) {
      table_nodes[i, 4 + j] <- node_attrib[[j]]
    }
  }
}

# Only connected nodes are considered. drop = FALSE keeps the result a matrix
# when a single node (or none) is connected, so it is still written one node
# per row.
k <- num_attribs + 4
table_nodes <- table_nodes[seq_len(num_nodes_connected), seq_len(k), drop = FALSE]

# Definition of links
#
# Columns: source node, target node, weight, directed flag, then the attributes
# stored for the pair.
num_connections <- length(list_connections)
table_connections <- matrix(nrow = num_connections, ncol = num_attribs + 4)
for (i in seq_len(num_connections)) {
  name_conexion <- names(list_connections_order)[i]
  # The connection key is "source target"; split it back into its two nodes.
  # NOTE(review): a node name containing a space would be split in the wrong
  # place here - confirm the input never has them.
  source_target <- strsplit(name_conexion, " ")
  table_connections[i, 1] <- source_target[[1]][1]
  table_connections[i, 2] <- source_target[[1]][2]
  table_connections[i, 3] <- format(list_connections_order[[i]], scientific = FALSE)
  table_connections[i, 4] <- directed
  connection_attrib <- hash_connections_attrib[[name_conexion]]
  for (j in seq_len(num_attribs)) {
    table_connections[i, 4 + j] <- connection_attrib[[j]]
  }
}

#################################
# Generate the file in GDF format
#
# The last step is to write the file in GDF format. We only have to add the headers
# before writing the information of the nodes and links.
#################################

# Write one piece of the GDF file (a header string or a data matrix) to
# name_file_gdf using the same comma-separated, unquoted, UTF-8 settings for
# every piece. `append = FALSE` starts the file, `append = TRUE` adds to it.
write_gdf_part <- function(x, append) {
  write.table(x, file = name_file_gdf, append = append, quote = FALSE,
              sep = ",", eol = "\n", na = "NA", dec = ".",
              row.names = FALSE, col.names = FALSE,
              qmethod = c("escape", "double"), fileEncoding = "UTF-8")
}

# One "<name> VARCHAR" column definition per configured attribute. sprintf()
# returns an empty vector when there are no attributes, so the headers are then
# left without extra columns.
stopifnot(length(name_attribs) >= num_attribs)
attrib_defs <- sprintf("%s VARCHAR", name_attribs[seq_len(num_attribs)])

# Definition of nodes
head_nodes <- paste(
  c("nodedef>name VARCHAR,links INT,Links_in INT,links_out INT", attrib_defs),
  collapse = ","
)
write_gdf_part(head_nodes, append = FALSE)
write_gdf_part(table_nodes, append = TRUE)

# Definition of links
head_arcs <- paste(
  c("edgedef>node1 VARCHAR,node2 VARCHAR, weight INT, directed BOOLEAN", attrib_defs),
  collapse = ","
)
write_gdf_part(head_arcs, append = TRUE)
write_gdf_part(table_connections, append = TRUE)